json
This commit is contained in:
@@ -1,4 +1,8 @@
|
||||
python -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install -r requirements.txt
|
||||
python save_iana_domains.py
|
||||
python save_iana_domains.py
|
||||
|
||||
# This will create a data/ directory with:
|
||||
# - Individual JSON files for each domain (e.g., data/com.json)
|
||||
# - Summary file with all domains (data/all_domains.json)
|
||||
@@ -1,3 +1,2 @@
|
||||
pymongo==4.6.0
|
||||
requests==2.31.0
|
||||
beautifulsoup4==4.12.2
|
||||
|
||||
@@ -1,14 +1,15 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to fetch IANA root domain database and save to MongoDB collection 'extensions'
|
||||
Script to fetch the IANA Root Zone Database and save it as JSON files
|
||||
"""
|
||||
|
||||
import pymongo
|
||||
import re
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
def fetch_domain_details(extension, url):
|
||||
@@ -246,16 +247,13 @@ def get_domain_type_and_manager(soup, extension, details):
|
||||
return domain_type, manager
|
||||
|
||||
def main():
|
||||
# MongoDB connection parameters
|
||||
mongo_uri = "mongodb://l2:27017/iana"
|
||||
# Create data directory if it doesn't exist
|
||||
data_dir = 'data'
|
||||
if not os.path.exists(data_dir):
|
||||
os.makedirs(data_dir)
|
||||
print(f"Created {data_dir} directory")
|
||||
|
||||
try:
|
||||
# Connect to MongoDB
|
||||
print("Connecting to MongoDB...")
|
||||
client = pymongo.MongoClient(mongo_uri)
|
||||
db = client.get_database()
|
||||
collection = db.extensions
|
||||
|
||||
# Fetch domain list from IANA
|
||||
domains = fetch_iana_domains()
|
||||
|
||||
@@ -283,7 +281,7 @@ def main():
|
||||
print(f" - Domain Type: {domain_type}")
|
||||
print(f" - TLD Manager: {manager}")
|
||||
|
||||
# Create document for MongoDB
|
||||
# Create document for JSON
|
||||
document = {
|
||||
'extension': extension,
|
||||
'url': url,
|
||||
@@ -296,47 +294,55 @@ def main():
|
||||
'record_updated': details['record_updated'],
|
||||
'registration_date': details['registration_date'],
|
||||
'iana_reports': details['iana_reports'],
|
||||
'last_fetched': datetime.utcnow(),
|
||||
'fetched_at': datetime.now().isoformat()
|
||||
}
|
||||
|
||||
# Upsert to MongoDB (update if exists, insert if new)
|
||||
collection.update_one(
|
||||
{'extension': extension},
|
||||
{'$set': document},
|
||||
upsert=True
|
||||
)
|
||||
# Save as individual JSON file
|
||||
filename = os.path.join(data_dir, f"{extension}.json")
|
||||
with open(filename, 'w', encoding='utf-8') as f:
|
||||
json.dump(document, f, indent=2, ensure_ascii=False)
|
||||
|
||||
processed_count += 1
|
||||
print(f" ✓ Saved {extension} to MongoDB")
|
||||
print(f" ✓ Saved {extension}.json")
|
||||
|
||||
# Add a 3-second delay to be respectful to the server
|
||||
time.sleep(3)
|
||||
|
||||
# Verify insertion
|
||||
total_count = collection.count_documents({})
|
||||
print(f"\n✅ Total extensions in MongoDB: {total_count}")
|
||||
print(f"✅ Processed {processed_count} domains this run")
|
||||
# Create a summary JSON file with all domains
|
||||
summary_file = os.path.join(data_dir, 'all_domains.json')
|
||||
summary_data = {
|
||||
'total_domains': total_domains,
|
||||
'processed_domains': processed_count,
|
||||
'fetched_at': datetime.now().isoformat(),
|
||||
'domains': []
|
||||
}
|
||||
|
||||
# Show sample data
|
||||
print("\nSample data:")
|
||||
sample_docs = collection.find().limit(5)
|
||||
for doc in sample_docs:
|
||||
print(f" {doc['extension']}: {doc.get('type', 'N/A')} - {doc.get('manager', 'N/A')}")
|
||||
# Add basic info for all domains
|
||||
for domain in domains:
|
||||
summary_data['domains'].append({
|
||||
'extension': domain['extension'],
|
||||
'type': domain.get('type'),
|
||||
'manager': domain.get('manager'),
|
||||
'url': domain['url']
|
||||
})
|
||||
|
||||
with open(summary_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(summary_data, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\n✅ Total extensions processed: {processed_count}")
|
||||
print(f"✅ Individual JSON files saved in {data_dir}/")
|
||||
print(f"✅ Summary file saved as {summary_file}")
|
||||
|
||||
# Show statistics by type
|
||||
type_stats = list(collection.aggregate([
|
||||
{'$match': {'type': {'$ne': None}}},
|
||||
{'$group': {'_id': '$type', 'count': {'$sum': 1}}},
|
||||
{'$sort': {'count': -1}}
|
||||
]))
|
||||
type_stats = {}
|
||||
for domain in domains:
|
||||
dtype = domain.get('type')
|
||||
if dtype:
|
||||
type_stats[dtype] = type_stats.get(dtype, 0) + 1
|
||||
|
||||
print(f"\nDomain types:")
|
||||
for stat in type_stats:
|
||||
print(f" {stat['_id']}: {stat['count']}")
|
||||
|
||||
client.close()
|
||||
print("\nMongoDB connection closed.")
|
||||
for dtype, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True):
|
||||
print(f" {dtype}: {count}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
|
||||
Reference in New Issue
Block a user