#!/usr/bin/env python3
"""Script to fetch domain zone data from the IANA root zone database."""

import re
import sys

import requests
from bs4 import BeautifulSoup

# Root zone database index page listing every TLD.
IANA_ROOT_DB_URL = 'https://www.iana.org/domains/root/db'

# Mimic a browser so the request is not rejected as a bare script.
# (Was duplicated verbatim in two functions; hoisted to one constant.)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Without a timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30  # seconds


def _parse_root_table(soup):
    """Parse the main TLD table on the root-db index page.

    Each qualifying row yields a dict with 'domain', 'type' and
    'tld_manager' keys. Returns an empty list if no table is found.
    """
    domains = []
    table = soup.find('table')
    if not table:
        return domains
    for row in table.find_all('tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        if len(cells) < 3:
            continue
        domain = cells[0].get_text(strip=True).lower()
        domain_type = cells[1].get_text(strip=True)
        tld_manager = cells[2].get_text(strip=True)
        # Normalize: strip anything that isn't a hostname character,
        # and collapse internal whitespace in the descriptive fields.
        domain = re.sub(r'[^a-z0-9.-]', '', domain)
        domain_type = re.sub(r'\s+', ' ', domain_type)
        tld_manager = re.sub(r'\s+', ' ', tld_manager)
        if domain and domain != '.':
            domains.append({
                'domain': domain,
                'type': domain_type,
                'tld_manager': tld_manager,
            })
    return domains


def _parse_domain_links(soup):
    """Fallback parse: follow per-domain detail links on the index page.

    Used only when the table parse produced nothing. Issues one extra
    HTTP request per domain, so it is much slower than the table path.
    """
    domains = []
    for link in soup.find_all('a', href=re.compile(r'/domains/root/db/')):
        domain_text = link.get_text(strip=True).lower()
        # NOTE(review): links whose visible text starts with '.' are
        # skipped here, mirroring the original filter — confirm the live
        # page's link text before relying on this fallback.
        if domain_text and len(domain_text) > 1 and not domain_text.startswith('.'):
            detail_url = f"https://www.iana.org{link['href']}"
            detail_data = fetch_domain_detail(detail_url)
            if detail_data:
                domains.append(detail_data)
    return domains


def fetch_domain_zone_data():
    """Fetch domain zone data from the IANA root zone database.

    Returns a list of dicts with 'domain', 'type' and 'tld_manager'
    keys, or None if the page could not be fetched or parsed.
    """
    try:
        response = requests.get(IANA_ROOT_DB_URL, headers=HEADERS,
                                timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        domains = _parse_root_table(soup)
        if not domains:
            # Table layout changed or missing — fall back to link crawling.
            domains = _parse_domain_links(soup)

        print(f"Fetched {len(domains)} domains from IANA root zone database")
        return domains
    except requests.RequestException as e:
        print(f"Error fetching IANA root zone data: {e}")
        return None
    except Exception as e:
        print(f"Error parsing IANA data: {e}")
        return None


def fetch_domain_detail(detail_url):
    """Fetch detailed information for a specific domain.

    Returns a dict with 'domain', 'type' and 'tld_manager' keys, or
    None on any fetch/parse failure.
    """
    try:
        response = requests.get(detail_url, headers=HEADERS,
                                timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Domain name is the last path segment of the detail URL.
        domain = detail_url.split('/')[-1].lower()
        domain_type = "generic"  # default when no type label is found
        tld_manager = "Unknown"

        # Look for a "Type" label and take the text that follows it.
        # ('string=' replaces the deprecated 'text=' keyword of bs4.)
        for element in soup.find_all(string=re.compile(r'Type|Type of domain',
                                                       re.IGNORECASE)):
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    domain_type = next_sibling.get_text(strip=True)
                    break

        # Same approach for the manager / sponsoring organisation.
        for element in soup.find_all(string=re.compile(r'Manager|Sponsor|Registry',
                                                       re.IGNORECASE)):
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    tld_manager = next_sibling.get_text(strip=True)
                    break

        return {
            'domain': domain,
            'type': domain_type,
            'tld_manager': tld_manager,
        }
    except Exception as e:
        print(f"Error fetching detail for {detail_url}: {e}")
        return None


def main():
    """Entry point: fetch the data and print a short sample report."""
    print("Fetching IANA root zone database data...")
    domains = fetch_domain_zone_data()

    if domains:
        print("\nSample data:")
        for i, domain in enumerate(domains[:10]):
            print(f"{i+1}. {domain['domain']} - {domain['type']} - {domain['tld_manager']}")
        print(f"\nTotal domains fetched: {len(domains)}")
        return domains

    print("Failed to fetch domain data")
    return None


if __name__ == "__main__":
    # Exit non-zero on failure so callers/cron can detect it
    # (previously the result of main() was silently discarded).
    sys.exit(0 if main() else 1)