json
This commit is contained in:
@@ -2,3 +2,7 @@ python -m venv venv
|
|||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
python save_iana_domains.py
|
python save_iana_domains.py
|
||||||
|
|
||||||
|
# This will create a data/ directory with:
|
||||||
|
# - Individual JSON files for each domain (e.g., data/com.json)
|
||||||
|
# - Summary file with all domains (data/all_domains.json)
|
||||||
@@ -1,3 +1,2 @@
|
|||||||
pymongo==4.6.0
|
|
||||||
requests==2.31.0
|
requests==2.31.0
|
||||||
beautifulsoup4==4.12.2
|
beautifulsoup4==4.12.2
|
||||||
|
|||||||
@@ -1,14 +1,15 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Script to fetch IANA root domain database and save to MongoDB collection 'extensions'
|
Script to fetch IANA root domain database and save as JSON files
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pymongo
|
|
||||||
import re
|
import re
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
import time
|
import time
|
||||||
|
import json
|
||||||
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
def fetch_domain_details(extension, url):
|
def fetch_domain_details(extension, url):
|
||||||
@@ -246,16 +247,13 @@ def get_domain_type_and_manager(soup, extension, details):
|
|||||||
return domain_type, manager
|
return domain_type, manager
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# MongoDB connection parameters
|
# Create data directory if it doesn't exist
|
||||||
mongo_uri = "mongodb://l2:27017/iana"
|
data_dir = 'data'
|
||||||
|
if not os.path.exists(data_dir):
|
||||||
|
os.makedirs(data_dir)
|
||||||
|
print(f"Created {data_dir} directory")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Connect to MongoDB
|
|
||||||
print("Connecting to MongoDB...")
|
|
||||||
client = pymongo.MongoClient(mongo_uri)
|
|
||||||
db = client.get_database()
|
|
||||||
collection = db.extensions
|
|
||||||
|
|
||||||
# Fetch domain list from IANA
|
# Fetch domain list from IANA
|
||||||
domains = fetch_iana_domains()
|
domains = fetch_iana_domains()
|
||||||
|
|
||||||
@@ -283,7 +281,7 @@ def main():
|
|||||||
print(f" - Domain Type: {domain_type}")
|
print(f" - Domain Type: {domain_type}")
|
||||||
print(f" - TLD Manager: {manager}")
|
print(f" - TLD Manager: {manager}")
|
||||||
|
|
||||||
# Create document for MongoDB
|
# Create document for JSON
|
||||||
document = {
|
document = {
|
||||||
'extension': extension,
|
'extension': extension,
|
||||||
'url': url,
|
'url': url,
|
||||||
@@ -296,47 +294,55 @@ def main():
|
|||||||
'record_updated': details['record_updated'],
|
'record_updated': details['record_updated'],
|
||||||
'registration_date': details['registration_date'],
|
'registration_date': details['registration_date'],
|
||||||
'iana_reports': details['iana_reports'],
|
'iana_reports': details['iana_reports'],
|
||||||
'last_fetched': datetime.utcnow(),
|
|
||||||
'fetched_at': datetime.now().isoformat()
|
'fetched_at': datetime.now().isoformat()
|
||||||
}
|
}
|
||||||
|
|
||||||
# Upsert to MongoDB (update if exists, insert if new)
|
# Save as individual JSON file
|
||||||
collection.update_one(
|
filename = os.path.join(data_dir, f"{extension}.json")
|
||||||
{'extension': extension},
|
with open(filename, 'w', encoding='utf-8') as f:
|
||||||
{'$set': document},
|
json.dump(document, f, indent=2, ensure_ascii=False)
|
||||||
upsert=True
|
|
||||||
)
|
|
||||||
|
|
||||||
processed_count += 1
|
processed_count += 1
|
||||||
print(f" ✓ Saved {extension} to MongoDB")
|
print(f" ✓ Saved {extension}.json")
|
||||||
|
|
||||||
# Add a 3-second delay to be respectful to the server
|
# Add a 3-second delay to be respectful to the server
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
# Verify insertion
|
# Create a summary JSON file with all domains
|
||||||
total_count = collection.count_documents({})
|
summary_file = os.path.join(data_dir, 'all_domains.json')
|
||||||
print(f"\n✅ Total extensions in MongoDB: {total_count}")
|
summary_data = {
|
||||||
print(f"✅ Processed {processed_count} domains this run")
|
'total_domains': total_domains,
|
||||||
|
'processed_domains': processed_count,
|
||||||
|
'fetched_at': datetime.now().isoformat(),
|
||||||
|
'domains': []
|
||||||
|
}
|
||||||
|
|
||||||
# Show sample data
|
# Add basic info for all domains
|
||||||
print("\nSample data:")
|
for domain in domains:
|
||||||
sample_docs = collection.find().limit(5)
|
summary_data['domains'].append({
|
||||||
for doc in sample_docs:
|
'extension': domain['extension'],
|
||||||
print(f" {doc['extension']}: {doc.get('type', 'N/A')} - {doc.get('manager', 'N/A')}")
|
'type': domain.get('type'),
|
||||||
|
'manager': domain.get('manager'),
|
||||||
|
'url': domain['url']
|
||||||
|
})
|
||||||
|
|
||||||
|
with open(summary_file, 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(summary_data, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
print(f"\n✅ Total extensions processed: {processed_count}")
|
||||||
|
print(f"✅ Individual JSON files saved in {data_dir}/")
|
||||||
|
print(f"✅ Summary file saved as {summary_file}")
|
||||||
|
|
||||||
# Show statistics by type
|
# Show statistics by type
|
||||||
type_stats = list(collection.aggregate([
|
type_stats = {}
|
||||||
{'$match': {'type': {'$ne': None}}},
|
for domain in domains:
|
||||||
{'$group': {'_id': '$type', 'count': {'$sum': 1}}},
|
dtype = domain.get('type')
|
||||||
{'$sort': {'count': -1}}
|
if dtype:
|
||||||
]))
|
type_stats[dtype] = type_stats.get(dtype, 0) + 1
|
||||||
|
|
||||||
print(f"\nDomain types:")
|
print(f"\nDomain types:")
|
||||||
for stat in type_stats:
|
for dtype, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True):
|
||||||
print(f" {stat['_id']}: {stat['count']}")
|
print(f" {dtype}: {count}")
|
||||||
|
|
||||||
client.close()
|
|
||||||
print("\nMongoDB connection closed.")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error: {e}")
|
print(f"Error: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user