This commit is contained in:
Kar
2026-03-09 16:07:04 +05:30
parent 466775c51c
commit 760aaae6c5
3 changed files with 49 additions and 40 deletions

View File

@@ -1,4 +1,8 @@
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python save_iana_domains.py
python save_iana_domains.py
# Running the script creates a data/ directory containing:
# - One JSON file per domain extension (e.g., data/com.json)
# - A summary file listing all domains (data/all_domains.json)

View File

@@ -1,3 +1,2 @@
pymongo==4.6.0
requests==2.31.0
beautifulsoup4==4.12.2

View File

@@ -1,14 +1,15 @@
#!/usr/bin/env python3
"""
Script to fetch IANA root domain database and save to MongoDB collection 'extensions'
Script to fetch IANA root domain database and save as JSON files
"""
import pymongo
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import json
import os
from datetime import datetime
def fetch_domain_details(extension, url):
@@ -246,16 +247,13 @@ def get_domain_type_and_manager(soup, extension, details):
return domain_type, manager
def main():
# MongoDB connection parameters
mongo_uri = "mongodb://l2:27017/iana"
# Create data directory if it doesn't exist
data_dir = 'data'
if not os.path.exists(data_dir):
os.makedirs(data_dir)
print(f"Created {data_dir} directory")
try:
# Connect to MongoDB
print("Connecting to MongoDB...")
client = pymongo.MongoClient(mongo_uri)
db = client.get_database()
collection = db.extensions
# Fetch domain list from IANA
domains = fetch_iana_domains()
@@ -283,7 +281,7 @@ def main():
print(f" - Domain Type: {domain_type}")
print(f" - TLD Manager: {manager}")
# Create document for MongoDB
# Create document for JSON
document = {
'extension': extension,
'url': url,
@@ -296,47 +294,55 @@ def main():
'record_updated': details['record_updated'],
'registration_date': details['registration_date'],
'iana_reports': details['iana_reports'],
'last_fetched': datetime.utcnow(),
'fetched_at': datetime.now().isoformat()
}
# Upsert to MongoDB (update if exists, insert if new)
collection.update_one(
{'extension': extension},
{'$set': document},
upsert=True
)
# Save as individual JSON file
filename = os.path.join(data_dir, f"{extension}.json")
with open(filename, 'w', encoding='utf-8') as f:
json.dump(document, f, indent=2, ensure_ascii=False)
processed_count += 1
print(f" ✓ Saved {extension} to MongoDB")
print(f" ✓ Saved {extension}.json")
# Pause for 3 seconds between requests to avoid overloading the server
time.sleep(3)
# Verify insertion
total_count = collection.count_documents({})
print(f"\n✅ Total extensions in MongoDB: {total_count}")
print(f"✅ Processed {processed_count} domains this run")
# Create a summary JSON file with all domains
summary_file = os.path.join(data_dir, 'all_domains.json')
summary_data = {
'total_domains': total_domains,
'processed_domains': processed_count,
'fetched_at': datetime.now().isoformat(),
'domains': []
}
# Show sample data
print("\nSample data:")
sample_docs = collection.find().limit(5)
for doc in sample_docs:
print(f" {doc['extension']}: {doc.get('type', 'N/A')} - {doc.get('manager', 'N/A')}")
# Add basic info for all domains
for domain in domains:
summary_data['domains'].append({
'extension': domain['extension'],
'type': domain.get('type'),
'manager': domain.get('manager'),
'url': domain['url']
})
with open(summary_file, 'w', encoding='utf-8') as f:
json.dump(summary_data, f, indent=2, ensure_ascii=False)
print(f"\n✅ Total extensions processed: {processed_count}")
print(f"✅ Individual JSON files saved in {data_dir}/")
print(f"✅ Summary file saved as {summary_file}")
# Show statistics by type
type_stats = list(collection.aggregate([
{'$match': {'type': {'$ne': None}}},
{'$group': {'_id': '$type', 'count': {'$sum': 1}}},
{'$sort': {'count': -1}}
]))
type_stats = {}
for domain in domains:
dtype = domain.get('type')
if dtype:
type_stats[dtype] = type_stats.get(dtype, 0) + 1
print(f"\nDomain types:")
for stat in type_stats:
print(f" {stat['_id']}: {stat['count']}")
client.close()
print("\nMongoDB connection closed.")
for dtype, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True):
print(f" {dtype}: {count}")
except Exception as e:
print(f"Error: {e}")