This commit is contained in:
Kar
2026-03-09 16:07:04 +05:30
parent 466775c51c
commit 760aaae6c5
3 changed files with 49 additions and 40 deletions

View File

@@ -2,3 +2,7 @@ python -m venv venv
source venv/bin/activate source venv/bin/activate
pip install -r requirements.txt pip install -r requirements.txt
python save_iana_domains.py python save_iana_domains.py
# Running the script creates a data/ directory containing:
# - One JSON file per domain (e.g., data/com.json)
# - A summary file listing all domains (data/all_domains.json)

View File

@@ -1,3 +1,2 @@
pymongo==4.6.0
requests==2.31.0 requests==2.31.0
beautifulsoup4==4.12.2 beautifulsoup4==4.12.2

View File

@@ -1,14 +1,15 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
Script to fetch IANA root domain database and save to MongoDB collection 'extensions' Script to fetch IANA root domain database and save as JSON files
""" """
import pymongo
import re import re
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urljoin from urllib.parse import urljoin
import time import time
import json
import os
from datetime import datetime from datetime import datetime
def fetch_domain_details(extension, url): def fetch_domain_details(extension, url):
@@ -246,16 +247,13 @@ def get_domain_type_and_manager(soup, extension, details):
return domain_type, manager return domain_type, manager
def main(): def main():
# MongoDB connection parameters # Create data directory if it doesn't exist
mongo_uri = "mongodb://l2:27017/iana" data_dir = 'data'
if not os.path.exists(data_dir):
os.makedirs(data_dir)
print(f"Created {data_dir} directory")
try: try:
# Connect to MongoDB
print("Connecting to MongoDB...")
client = pymongo.MongoClient(mongo_uri)
db = client.get_database()
collection = db.extensions
# Fetch domain list from IANA # Fetch domain list from IANA
domains = fetch_iana_domains() domains = fetch_iana_domains()
@@ -283,7 +281,7 @@ def main():
print(f" - Domain Type: {domain_type}") print(f" - Domain Type: {domain_type}")
print(f" - TLD Manager: {manager}") print(f" - TLD Manager: {manager}")
# Create document for MongoDB # Create document for JSON
document = { document = {
'extension': extension, 'extension': extension,
'url': url, 'url': url,
@@ -296,47 +294,55 @@ def main():
'record_updated': details['record_updated'], 'record_updated': details['record_updated'],
'registration_date': details['registration_date'], 'registration_date': details['registration_date'],
'iana_reports': details['iana_reports'], 'iana_reports': details['iana_reports'],
'last_fetched': datetime.utcnow(),
'fetched_at': datetime.now().isoformat() 'fetched_at': datetime.now().isoformat()
} }
# Upsert to MongoDB (update if exists, insert if new) # Save as individual JSON file
collection.update_one( filename = os.path.join(data_dir, f"{extension}.json")
{'extension': extension}, with open(filename, 'w', encoding='utf-8') as f:
{'$set': document}, json.dump(document, f, indent=2, ensure_ascii=False)
upsert=True
)
processed_count += 1 processed_count += 1
print(f" ✓ Saved {extension} to MongoDB") print(f" ✓ Saved {extension}.json")
# Add a 3-second delay to be respectful to the server # Add a 3-second delay to be respectful to the server
time.sleep(3) time.sleep(3)
# Verify insertion # Create a summary JSON file with all domains
total_count = collection.count_documents({}) summary_file = os.path.join(data_dir, 'all_domains.json')
print(f"\n✅ Total extensions in MongoDB: {total_count}") summary_data = {
print(f"✅ Processed {processed_count} domains this run") 'total_domains': total_domains,
'processed_domains': processed_count,
'fetched_at': datetime.now().isoformat(),
'domains': []
}
# Show sample data # Add basic info for all domains
print("\nSample data:") for domain in domains:
sample_docs = collection.find().limit(5) summary_data['domains'].append({
for doc in sample_docs: 'extension': domain['extension'],
print(f" {doc['extension']}: {doc.get('type', 'N/A')} - {doc.get('manager', 'N/A')}") 'type': domain.get('type'),
'manager': domain.get('manager'),
'url': domain['url']
})
with open(summary_file, 'w', encoding='utf-8') as f:
json.dump(summary_data, f, indent=2, ensure_ascii=False)
print(f"\n✅ Total extensions processed: {processed_count}")
print(f"✅ Individual JSON files saved in {data_dir}/")
print(f"✅ Summary file saved as {summary_file}")
# Show statistics by type # Show statistics by type
type_stats = list(collection.aggregate([ type_stats = {}
{'$match': {'type': {'$ne': None}}}, for domain in domains:
{'$group': {'_id': '$type', 'count': {'$sum': 1}}}, dtype = domain.get('type')
{'$sort': {'count': -1}} if dtype:
])) type_stats[dtype] = type_stats.get(dtype, 0) + 1
print(f"\nDomain types:") print(f"\nDomain types:")
for stat in type_stats: for dtype, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True):
print(f" {stat['_id']}: {stat['count']}") print(f" {dtype}: {count}")
client.close()
print("\nMongoDB connection closed.")
except Exception as e: except Exception as e:
print(f"Error: {e}") print(f"Error: {e}")