This commit is contained in:
Kar
2026-03-09 16:03:13 +05:30
commit 466775c51c
4 changed files with 357 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
venv
*.pyc
__pycache__
*.log
data/

4
readme.md Normal file
View File

@@ -0,0 +1,4 @@
python -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python save_iana_domains.py

3
requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
pymongo==4.6.0
requests==2.31.0
beautifulsoup4==4.12.2

345
save_iana_domains.py Normal file
View File

@@ -0,0 +1,345 @@
#!/usr/bin/env python3
"""
Script to fetch IANA root domain database and save to MongoDB collection 'extensions'
"""
import re
import time
from datetime import datetime, timezone
from urllib.parse import urljoin

import pymongo
import requests
from bs4 import BeautifulSoup
def _empty_domain_details():
    """Return a fresh details dict with every field empty (also the error fallback)."""
    return {
        'name_servers': [],
        'whois_server': None,
        'rdap_server': None,
        'registration_url': None,
        'record_updated': None,
        'registration_date': None,
        'iana_reports': []
    }


def _extract_name_servers(soup, content_text):
    """Collect candidate name-server hostnames from a parsed IANA TLD page.

    Tries, in order: table first-column cells that look like hostnames; regex
    patterns over the raw page text (only if the tables yielded nothing); and
    pre/code blocks scanned line by line (tab-separated host records).

    Returns a sorted, de-duplicated list of strings longer than 3 characters
    (sorted so repeated runs store a deterministic order).
    """
    name_servers = []
    # 1) Table cells whose first column looks like a hostname.
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) >= 1:
                first_cell = cells[0].get_text().strip()
                # Heuristic: contains a dot and is not a URL.
                if '.' in first_cell and not first_cell.startswith('http'):
                    name_servers.append(first_cell)
    # 2) Fallback: regex patterns over the raw page text.
    if not name_servers:
        ns_patterns = [
            r'([a-zA-Z0-9.-]+\.nic\.[a-zA-Z]{2,})',  # *.nic.tld pattern
            r'([a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s+[0-9.]+)',  # hostname + IP pattern
            r'(ns[0-9]*\.[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # ns*.domain.tld pattern
            r'([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\s+[0-9.]+\s+[0-9a-fA-F:]+'  # hostname + IPv4 + IPv6
        ]
        for pattern in ns_patterns:
            for match in re.findall(pattern, content_text, re.IGNORECASE):
                if isinstance(match, tuple):
                    # If the regex returns groups, take the first non-empty one.
                    ns_name = match[0] if match[0] else match[1] if len(match) > 1 else ''
                else:
                    ns_name = match
                if ns_name and '.' in ns_name and len(ns_name) > 5:
                    name_servers.append(ns_name.strip())
    # 3) Also scan pre/code blocks line by line.
    # NOTE(review): splits on '\t' — assumes tab-separated records; confirm
    # against the live page markup.
    for block in soup.find_all(['pre', 'code']):
        for line in block.get_text().strip().split('\n'):
            if re.match(r'^[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', line.strip()):
                parts = line.strip().split('\t')
                if parts:
                    name_servers.append(parts[0].strip())
    return sorted({ns for ns in name_servers if ns and len(ns) > 3})


def fetch_domain_details(extension, url):
    """Fetch detailed information for a specific domain extension.

    Scrapes the IANA per-TLD page at ``url`` and extracts WHOIS/RDAP servers,
    registration URL, record dates, IANA report links and name servers.

    Args:
        extension: TLD string without the leading dot (used in error logging).
        url: Absolute URL of the IANA root-db page for this extension.

    Returns:
        dict with keys name_servers, whois_server, rdap_server,
        registration_url, record_updated, registration_date, iana_reports.
        On any error an all-empty details dict is returned instead of raising,
        so one bad page never aborts the crawl.
    """
    try:
        # Timeout keeps one slow/unresponsive page from hanging the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        details = _empty_domain_details()
        # All the simple fields are scraped from the page's plain text.
        content_text = soup.get_text()
        # WHOIS server
        whois_match = re.search(r'WHOIS Server:\s*(\S+)', content_text)
        if whois_match:
            details['whois_server'] = whois_match.group(1)
        # RDAP server
        rdap_match = re.search(r'RDAP Server:\s*(\S+)', content_text)
        if rdap_match:
            details['rdap_server'] = rdap_match.group(1)
        # Registration URL (rendered inside square brackets in the page text)
        reg_url_match = re.search(r'URL for registration services:\s*\[([^\]]+)\]', content_text)
        if reg_url_match:
            details['registration_url'] = reg_url_match.group(1)
        # Record dates (ISO yyyy-mm-dd)
        updated_match = re.search(r'Record last updated\s+(\d{4}-\d{2}-\d{2})', content_text)
        if updated_match:
            details['record_updated'] = updated_match.group(1)
        reg_date_match = re.search(r'Registration date\s+(\d{4}-\d{2}-\d{2})', content_text)
        if reg_date_match:
            details['registration_date'] = reg_date_match.group(1)
        # IANA reports: any anchor whose href points into /reports/
        for link in soup.find_all('a', href=re.compile(r'/reports/')):
            details['iana_reports'].append({
                'title': link.get_text().strip(),
                'url': urljoin(url, link['href'])
            })
        details['name_servers'] = _extract_name_servers(soup, content_text)
        print(f"  - WHOIS: {details['whois_server']}")
        print(f"  - RDAP: {details['rdap_server']}")
        print(f"  - Registration URL: {details['registration_url']}")
        print(f"  - Name Servers: {details['name_servers']}")
        return details
    except Exception as e:
        # Best-effort crawl: log and return an empty record rather than abort.
        print(f"Error fetching details for {extension}: {e}")
        return _empty_domain_details()
def fetch_iana_domains():
    """Fetch all domain extensions from the IANA root database index page.

    Downloads https://www.iana.org/domains/root/db, locates the table whose
    header row contains 'Domain', 'Type' and 'TLD Manager', and converts each
    data row into a dict.

    Returns:
        list of dicts with keys 'extension' (TLD without leading dot),
        'url' (per-TLD detail page), 'type' and 'manager' (either may be
        None when the cell is empty). Empty list if the table is not found.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
    """
    base_url = "https://www.iana.org/domains/root/db"
    print("Fetching IANA root domain database...")
    # Timeout so a dead connection fails fast instead of hanging forever.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    domains = []
    # Locate the first table whose header row has the expected columns.
    main_table = None
    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        if len(rows) > 1:  # must have at least one data row
            first_row = rows[0]
            headers = [cell.get_text().strip() for cell in first_row.find_all(['th', 'td'])]
            if 'Domain' in headers and 'Type' in headers and 'TLD Manager' in headers:
                main_table = table
                break
    if not main_table:
        print("Could not find the main domain table!")
        return domains
    # Extract data rows (skip the header row).
    for row in main_table.find_all('tr')[1:]:
        cells = row.find_all(['td', 'th'])
        if len(cells) >= 3:
            domain_text = cells[0].get_text().strip()
            domain_type = cells[1].get_text().strip()
            manager = cells[2].get_text().strip()
            # The index lists domains with a leading dot; strip it.
            if domain_text.startswith('.'):
                domain_text = domain_text[1:]
            if domain_text and len(domain_text) > 0:
                url = urljoin(base_url, f"/domains/root/db/{domain_text}.html")
                domains.append({
                    'extension': domain_text,
                    'url': url,
                    'type': domain_type if domain_type else None,
                    'manager': manager if manager else None
                })
    print(f"Found {len(domains)} domain extensions from main table")
    return domains
def get_domain_type_and_manager(soup, extension, details):
    """Extract the domain type and TLD manager from an individual domain page.

    Args:
        soup: parsed page (BeautifulSoup-like: get_text / find_all).
        extension: TLD string (currently unused; kept for interface stability).
        details: dict of previously scraped fields; 'registration_url' is used
            as a fallback source for the manager name.

    Returns:
        (domain_type, manager) tuple; either element may be None.
    """
    page_text = soup.get_text()

    # The type appears parenthesized, e.g. "(generic top-level domain)".
    type_hit = re.search(r'\(([^)]*top-level domain[^)]*)\)', page_text)
    domain_type = type_hit.group(1) if type_hit else None

    manager = None
    # Preferred source: first bold tag following a "Sponsoring Organisation"
    # heading. Keep scanning headings until one actually has a bold tag.
    for heading in soup.find_all('h2'):
        if 'Sponsoring Organisation' in heading.get_text():
            bold_tag = heading.find_next('b')
            if bold_tag:
                manager = bold_tag.get_text().strip()
                break

    # Fallback 1: derive a manager name from the registration-services URL.
    if not manager and details.get('registration_url'):
        reg_url = details['registration_url']
        host_hit = re.search(r'https?://(?:www\.)?([^/]+)', reg_url)
        if host_hit:
            manager = host_hit.group(1)
        else:
            # Bare URL without a protocol — strip common prefixes and use as-is.
            manager = reg_url.replace('http://', '').replace('https://', '').replace('www.', '')

    # Fallback 2: generic "Label: value" patterns in the page text.
    if not manager:
        for label_pattern in (
            r'Registry[:\s]+([^\n]+)',
            r'Sponsor[:\s]+([^\n]+)',
            r'Manager[:\s]+([^\n]+)',
            r'Organization[:\s]+([^\n]+)',
        ):
            label_hit = re.search(label_pattern, page_text, re.IGNORECASE)
            if label_hit:
                manager = label_hit.group(1).strip()
                break

    return domain_type, manager
def main():
    """Crawl the IANA root zone database and upsert every TLD into MongoDB.

    Connects to the database named in the URI path, fetches the list of
    extensions from the root-db index page, scrapes each per-TLD page for
    registry details, and upserts one document per extension into the
    'extensions' collection (keyed by 'extension'). Prints progress and
    summary statistics; any top-level error is logged rather than raised.
    """
    # MongoDB connection parameters (database name comes from the URI path).
    mongo_uri = "mongodb://l2:27017/iana"
    try:
        # Connect to MongoDB
        print("Connecting to MongoDB...")
        client = pymongo.MongoClient(mongo_uri)
        db = client.get_database()
        collection = db.extensions
        # Fetch the domain list from the IANA index page.
        domains = fetch_iana_domains()
        if not domains:
            print("No domains found. Exiting.")
            return
        total_domains = len(domains)
        processed_count = 0
        # Process each domain
        for i, domain in enumerate(domains, 1):
            extension = domain['extension']
            url = domain['url']
            print(f"\nProcessing {extension} ({i}/{total_domains})...")
            # Fetch detailed information (name servers, WHOIS, RDAP, etc.)
            details = fetch_domain_details(extension, url)
            # Use the type and manager already extracted from the index table.
            domain_type = domain.get('type')
            manager = domain.get('manager')
            print(f"  - Domain Type: {domain_type}")
            print(f"  - TLD Manager: {manager}")
            # Create document for MongoDB
            document = {
                'extension': extension,
                'url': url,
                'type': domain_type,
                'manager': manager,
                'whois_server': details['whois_server'],
                'rdap_server': details['rdap_server'],
                'registration_url': details['registration_url'],
                'name_servers': details['name_servers'],
                'record_updated': details['record_updated'],
                'registration_date': details['registration_date'],
                'iana_reports': details['iana_reports'],
                # utcnow() is deprecated; store a timezone-aware UTC timestamp.
                'last_fetched': datetime.now(timezone.utc),
                'fetched_at': datetime.now().isoformat()
            }
            # Upsert to MongoDB (update if exists, insert if new)
            collection.update_one(
                {'extension': extension},
                {'$set': document},
                upsert=True
            )
            processed_count += 1
            print(f"  ✓ Saved {extension} to MongoDB")
            # Be respectful to the IANA server between requests.
            time.sleep(3)
        # Verify insertion
        total_count = collection.count_documents({})
        print(f"\n✅ Total extensions in MongoDB: {total_count}")
        print(f"✅ Processed {processed_count} domains this run")
        # Show sample data
        print("\nSample data:")
        for doc in collection.find().limit(5):
            print(f"  {doc['extension']}: {doc.get('type', 'N/A')} - {doc.get('manager', 'N/A')}")
        # Show per-type counts, most common first.
        type_stats = list(collection.aggregate([
            {'$match': {'type': {'$ne': None}}},
            {'$group': {'_id': '$type', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]))
        print("\nDomain types:")
        for stat in type_stats:
            print(f"  {stat['_id']}: {stat['count']}")
        client.close()
        print("\nMongoDB connection closed.")
    except Exception as e:
        # Top-level boundary: log and exit cleanly instead of crashing.
        print(f"Error: {e}")
if __name__ == "__main__":
main()