This commit is contained in:
Kar
2026-03-11 23:08:57 +05:30
parent 26e70981ee
commit 95aab950da
6 changed files with 892 additions and 1 deletions

140
fetch_domain_zone.py Normal file
View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Script to fetch domain zone data from IANA root zone database
"""
import requests
from bs4 import BeautifulSoup
import sys
import re
def _tld_from_cell(cell):
    """Return the cleaned TLD label for the first cell of a table row.

    The visible label is lower-cased and reduced to the characters
    ``a-z``, ``0-9``, ``.`` and ``-``.  That filter erases every non-ASCII
    letter, so an internationalized label would collapse to ``"."`` (or to
    a misleading fragment) and the row would be lost.  When the label
    contains non-ASCII letters and the cell holds a link, the last path
    segment of the link's ``href`` (minus a ``.html``/``.htm`` extension)
    is used instead.

    Args:
        cell: the BeautifulSoup ``<td>`` tag holding the domain label.

    Returns:
        The cleaned label as a string; may be empty or ``"."`` when nothing
        usable was found (the caller filters those out).
    """
    label = cell.get_text(strip=True).lower()
    cleaned = re.sub(r'[^a-z0-9.-]', '', label)

    has_non_ascii_letters = any(ord(ch) > 127 and ch.isalpha() for ch in label)
    if not has_non_ascii_letters:
        return cleaned

    link = cell.find('a', href=True)
    if link is None:
        return cleaned

    # NOTE(review): assumes the href's file name is the ASCII form of the
    # label (e.g. "xn--...") -- confirm against the live page.
    stem = link['href'].rstrip('/').rsplit('/', 1)[-1].lower()
    stem = re.sub(r'\.html?$', '', stem)
    stem = re.sub(r'[^a-z0-9-]', '', stem)
    if not stem:
        return cleaned

    # Keep the leading dot only if the visible label carried one, so these
    # entries are formatted like the ASCII rows of the same table.
    prefix = '.' if cleaned.startswith('.') else ''
    return prefix + stem


def fetch_domain_zone_data():
    """Fetch domain zone data from the IANA root zone database page.

    Downloads ``https://www.iana.org/domains/root/db`` and reads the first
    ``<table>`` on the page: every row after the first that has at least
    three ``<td>`` cells yields one record.  If that produces no records,
    links whose ``href`` contains ``/domains/root/db/`` are followed one by
    one through ``fetch_domain_detail``.

    Returns:
        A list of dicts with the keys ``'domain'``, ``'type'`` and
        ``'tld_manager'`` (possibly empty), or ``None`` when the request or
        the parsing failed.  Failures are reported with ``print`` rather
        than raised.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    root_db_url = 'https://www.iana.org/domains/root/db'
    # Seconds.  requests applies no timeout unless one is given, so a
    # stalled connection would otherwise hang the script forever.
    request_timeout = 30

    try:
        # Add headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(root_db_url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        domains = []

        # Primary strategy: first table on the page, one record per row.
        table = soup.find('table')
        if table:
            for row in table.find_all('tr')[1:]:  # Skip header row
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue

                domain = _tld_from_cell(cells[0])
                # Collapse runs of whitespace left over from the markup.
                domain_type = re.sub(r'\s+', ' ', cells[1].get_text(strip=True))
                tld_manager = re.sub(r'\s+', ' ', cells[2].get_text(strip=True))

                if domain and domain != '.':
                    domains.append({
                        'domain': domain,
                        'type': domain_type,
                        'tld_manager': tld_manager
                    })

        # Fallback strategy: follow per-domain links when the table gave nothing.
        if not domains:
            domain_links = soup.find_all('a', href=re.compile(r'/domains/root/db/'))
            for link in domain_links:
                domain_text = link.get_text(strip=True).lower()
                # NOTE(review): this skips every link whose text starts with
                # '.', which looks inverted if the page labels TLDs as
                # ".com" -- confirm the intended filter before relying on
                # this fallback.
                if domain_text and len(domain_text) > 1 and not domain_text.startswith('.'):
                    # urljoin handles both site-relative and absolute hrefs.
                    detail_url = urljoin(root_db_url, link['href'])
                    detail_data = fetch_domain_detail(detail_url)
                    if detail_data:
                        domains.append(detail_data)

        print(f"Fetched {len(domains)} domains from IANA root zone database")
        return domains
    except requests.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts.
        print(f"Error fetching IANA root zone data: {e}")
        return None
    except Exception as e:
        # Best-effort boundary: any parsing problem is reported, not raised.
        print(f"Error parsing IANA data: {e}")
        return None
def _text_after_label(soup, label_pattern):
    """Return the text of the element that follows the first matching label.

    Scans the text nodes of ``soup`` that match ``label_pattern``.  For each
    one, the element after the node's parent (next sibling, or failing that
    the next element in document order) is read.  The first non-empty,
    whitespace-normalized text found is returned.

    Args:
        soup: parsed BeautifulSoup document.
        label_pattern: compiled regular expression identifying label text.

    Returns:
        The neighbouring text, or ``None`` if no candidate had any text.
    """
    # ``string=`` is the current name of the argument formerly called ``text=``.
    for label in soup.find_all(string=label_pattern):
        parent = label.parent
        if not parent:
            continue
        neighbour = parent.find_next_sibling() or parent.find_next()
        if not neighbour:
            continue
        text = re.sub(r'\s+', ' ', neighbour.get_text(strip=True))
        if text:
            return text
    return None


def fetch_domain_detail(detail_url):
    """Fetch detailed information for a specific domain.

    Downloads ``detail_url`` and heuristically extracts the domain type and
    manager: the text next to the first label matching "Type", and the text
    next to the first label matching "Manager", "Sponsor" or "Registry".

    Args:
        detail_url: absolute URL of the domain's detail page.

    Returns:
        A dict with the keys ``'domain'``, ``'type'`` and ``'tld_manager'``,
        or ``None`` if the request or parsing failed (the error is printed,
        not raised).  ``'domain'`` is the last path segment of the URL,
        lower-cased, without query string, fragment or ``.html``/``.htm``
        extension.  ``'type'`` defaults to ``"generic"`` and
        ``'tld_manager'`` to ``"Unknown"`` when nothing is found.
    """
    # Seconds.  requests applies no timeout unless one is given.
    request_timeout = 30

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(detail_url, headers=headers, timeout=request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Derive the domain from the URL; drop query/fragment, a trailing
        # slash and the page extension so ".../com.html" yields "com"
        # instead of "com.html".
        path_only = detail_url.split('#', 1)[0].split('?', 1)[0]
        last_segment = path_only.rstrip('/').rsplit('/', 1)[-1].lower()
        domain = re.sub(r'\.html?$', '', last_segment)
        # NOTE(review): the value carries no leading dot, while labels read
        # from the listing table keep whatever the page shows -- confirm
        # which form consumers expect.

        # Defaults are kept unless a non-empty value is found on the page.
        domain_type = _text_after_label(
            soup, re.compile(r'Type|Type of domain', re.IGNORECASE)
        ) or "generic"
        tld_manager = _text_after_label(
            soup, re.compile(r'Manager|Sponsor|Registry', re.IGNORECASE)
        ) or "Unknown"

        return {
            'domain': domain,
            'type': domain_type,
            'tld_manager': tld_manager
        }
    except Exception as e:
        # Best-effort lookup: one bad detail page must not abort the caller.
        print(f"Error fetching detail for {detail_url}: {e}")
        return None
def main():
    """Run the fetch and print a short preview of the result.

    Calls ``fetch_domain_zone_data()``, prints up to the first ten records
    as numbered lines, then the total count.

    Returns:
        The list returned by ``fetch_domain_zone_data()``, or ``None`` when
        that call returned ``None`` or an empty list.
    """
    print("Fetching IANA root zone database data...")
    records = fetch_domain_zone_data()

    # Nothing usable came back: report and bail out early.
    if not records:
        print("Failed to fetch domain data")
        return None

    print(f"\nSample data:")
    for position, record in enumerate(records[:10], start=1):
        print(f"{position}. {record['domain']} - {record['type']} - {record['tld_manager']}")
    print(f"\nTotal domains fetched: {len(records)}")
    return records
if __name__ == "__main__":
    # main() returns the fetched records on success and None on failure;
    # turn that into the process exit status so shells and schedulers can
    # detect a failed fetch instead of always seeing exit code 0.
    sys.exit(0 if main() else 1)