From 760aaae6c50457a3bf8f11a6f346c3b0ad662752 Mon Sep 17 00:00:00 2001 From: "Kar@k5" Date: Mon, 9 Mar 2026 16:07:04 +0530 Subject: [PATCH] json --- readme.md | 6 +++- requirements.txt | 1 - save_iana_domains.py | 82 ++++++++++++++++++++++++-------------------- 3 files changed, 49 insertions(+), 40 deletions(-) diff --git a/readme.md b/readme.md index b223a7e..42b57ae 100644 --- a/readme.md +++ b/readme.md @@ -1,4 +1,8 @@ python -m venv venv source venv/bin/activate pip install -r requirements.txt -python save_iana_domains.py \ No newline at end of file +python save_iana_domains.py + +# This will create a data/ directory with: +# - Individual JSON files for each domain (e.g., data/com.json) +# - Summary file with all domains (data/all_domains.json) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 3b0eaf1..f14b614 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ -pymongo==4.6.0 requests==2.31.0 beautifulsoup4==4.12.2 diff --git a/save_iana_domains.py b/save_iana_domains.py index e6ced38..cff6fb7 100644 --- a/save_iana_domains.py +++ b/save_iana_domains.py @@ -1,14 +1,15 @@ #!/usr/bin/env python3 """ -Script to fetch IANA root domain database and save to MongoDB collection 'extensions' +Script to fetch IANA root domain database and save as JSON files """ -import pymongo import re import requests from bs4 import BeautifulSoup from urllib.parse import urljoin import time +import json +import os from datetime import datetime def fetch_domain_details(extension, url): @@ -246,16 +247,13 @@ def get_domain_type_and_manager(soup, extension, details): return domain_type, manager def main(): - # MongoDB connection parameters - mongo_uri = "mongodb://l2:27017/iana" + # Create data directory if it doesn't exist + data_dir = 'data' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + print(f"Created {data_dir} directory") try: - # Connect to MongoDB - print("Connecting to MongoDB...") - client = pymongo.MongoClient(mongo_uri) - db = client.get_database() - collection = db.extensions - # Fetch domain list from IANA domains = fetch_iana_domains() @@ -283,7 +281,7 @@ def main(): print(f" - Domain Type: {domain_type}") print(f" - TLD Manager: {manager}") - # Create document for MongoDB + # Create document for JSON document = { 'extension': extension, 'url': url, @@ -296,47 +294,55 @@ def main(): 'record_updated': details['record_updated'], 'registration_date': details['registration_date'], 'iana_reports': details['iana_reports'], - 'last_fetched': datetime.utcnow(), 'fetched_at': datetime.now().isoformat() } - # Upsert to MongoDB (update if exists, insert if new) - collection.update_one( - {'extension': extension}, - {'$set': document}, - upsert=True - ) + # Save as individual JSON file + filename = os.path.join(data_dir, f"{extension}.json") + with open(filename, 'w', encoding='utf-8') as f: + json.dump(document, f, indent=2, ensure_ascii=False) processed_count += 1 - print(f" ✓ Saved {extension} to MongoDB") + print(f" ✓ Saved {extension}.json") # Add a 3-second delay to be respectful to the server time.sleep(3) - # Verify insertion - total_count = collection.count_documents({}) - print(f"\n✅ Total extensions in MongoDB: {total_count}") - print(f"✅ Processed {processed_count} domains this run") + # Create a summary JSON file with all domains + summary_file = os.path.join(data_dir, 'all_domains.json') + summary_data = { + 'total_domains': total_domains, + 'processed_domains': processed_count, + 'fetched_at': datetime.now().isoformat(), + 'domains': [] + } - # Show sample data - print("\nSample data:") - sample_docs = collection.find().limit(5) - for doc in sample_docs: - print(f" {doc['extension']}: {doc.get('type', 'N/A')} - {doc.get('manager', 'N/A')}") + # Add basic info for all domains + for domain in domains: + summary_data['domains'].append({ + 'extension': domain['extension'], + 'type': domain.get('type'), + 'manager': domain.get('manager'), + 'url': domain['url'] + }) + + with open(summary_file, 'w', encoding='utf-8') as f: + json.dump(summary_data, f, indent=2, ensure_ascii=False) + + print(f"\n✅ Total extensions processed: {processed_count}") + print(f"✅ Individual JSON files saved in {data_dir}/") + print(f"✅ Summary file saved as {summary_file}") # Show statistics by type - type_stats = list(collection.aggregate([ - {'$match': {'type': {'$ne': None}}}, - {'$group': {'_id': '$type', 'count': {'$sum': 1}}}, - {'$sort': {'count': -1}} - ])) + type_stats = {} + for domain in domains: + dtype = domain.get('type') + if dtype: + type_stats[dtype] = type_stats.get(dtype, 0) + 1 print(f"\nDomain types:") - for stat in type_stats: - print(f" {stat['_id']}: {stat['count']}") - - client.close() - print("\nMongoDB connection closed.") + for dtype, count in sorted(type_stats.items(), key=lambda x: x[1], reverse=True): + print(f" {dtype}: {count}") except Exception as e: print(f"Error: {e}")