#!/usr/bin/env python3
"""
Script to update domain_zone table with additional data fields
"""

import getpass
import json
import re
import sys
import time
from urllib.parse import urlsplit

import idna  # noqa: F401  (unused in this module; kept because it was imported originally)
import mysql.connector
import requests
from bs4 import BeautifulSoup

# Database connection configuration
DB_CONFIG = {
    'host': 'l2',
    'port': 3306,
    'user': 'root',
    'password': None,  # Will be set from command line or input
    'database': 'sp_spider',
    'charset': 'utf8mb4',
    'ssl_disabled': True,
    'auth_plugin': 'mysql_native_password'
}

# Seconds to wait between domains so the remote services are not hammered.
REQUEST_DELAY_SECONDS = 1

# Label patterns used to locate cells on the IANA detail page.
# NOTE(review): these patterns are very broad (e.g. 'ns' matches any text
# containing those two letters) and only the FIRST matching string in the
# document is inspected, so a lookup yields None whenever that first match is
# not inside a <td>.  Confirm against the real page markup before tightening.
_REGISTRATION_HREF = re.compile(r'registrar|registry', re.IGNORECASE)
_WHOIS_LABEL = re.compile(r'WHOIS|whois', re.IGNORECASE)
_RDAP_LABEL = re.compile(r'RDAP|rdap', re.IGNORECASE)
_NAME_SERVER_LABEL = re.compile(r'name.?server|ns', re.IGNORECASE)
_SPONSOR_LABEL = re.compile(r'sponsor|registry|manager', re.IGNORECASE)
_ADMIN_LABEL = re.compile(r'administrative|admin', re.IGNORECASE)
_TECH_LABEL = re.compile(r'technical|tech', re.IGNORECASE)

# RDAP endpoints probed in order; the first one answering HTTP 200 with a
# JSON object wins.
_RDAP_URL_TEMPLATES = (
    "https://rdap.org/domain/{domain}",
    "https://data.iana.org/rdap/{domain}",
    "https://rdap.verisign.com/com/v1/domain/{domain}",
    "https://rdap.nic.fr/domain/{domain}",
)


def _labelled_cell_text(soup, label_pattern):
    """Return the text of the <td> that follows a matching label cell.

    The first string in ``soup`` matching ``label_pattern`` is located.  If
    its parent element is a ``<td>`` with a following ``<td>`` sibling, the
    stripped text of that sibling is returned; in every other case the result
    is ``None``.
    """
    label = soup.find(string=label_pattern)
    if not label:
        return None

    parent = label.parent
    if parent is None or parent.name != 'td':
        return None

    value_cell = parent.find_next_sibling('td')
    if value_cell is None:
        return None
    return value_cell.get_text(strip=True)


def get_iana_registry_info(domain):
    """Get registry information from IANA for a domain.

    Args:
        domain: Domain name, with or without a leading dot.

    Returns:
        A dict with the keys ``registration_url``, ``whois_server``,
        ``rdap_server``, ``name_servers`` (list), ``sponsoring``,
        ``administrative`` and ``technical``.  Values that could not be found
        stay ``None`` (or ``[]`` for ``name_servers``).  Returns ``None`` when
        the page cannot be fetched or parsed; the error is printed.
    """
    try:
        # Remove dot prefix if present
        clean_domain = domain.lstrip('.')

        url = f"https://www.iana.org/domains/root/db/{clean_domain}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        info = {
            'registration_url': None,
            'whois_server': _labelled_cell_text(soup, _WHOIS_LABEL),
            'rdap_server': _labelled_cell_text(soup, _RDAP_LABEL),
            'name_servers': [],
            'sponsoring': _labelled_cell_text(soup, _SPONSOR_LABEL),
            'administrative': _labelled_cell_text(soup, _ADMIN_LABEL),
            'technical': _labelled_cell_text(soup, _TECH_LABEL),
        }

        # Registration URL: first link whose href mentions registrar/registry.
        reg_link = soup.find('a', href=_REGISTRATION_HREF)
        if reg_link is not None:
            info['registration_url'] = reg_link.get('href')

        # Name servers are expected as a comma-separated list in one cell.
        ns_text = _labelled_cell_text(soup, _NAME_SERVER_LABEL)
        if ns_text:
            info['name_servers'] = [
                ns.strip() for ns in ns_text.split(',') if ns.strip()
            ]

        return info

    except Exception as e:
        # Best-effort boundary: one failing domain must not abort the batch.
        print(f"Error fetching IANA info for {domain}: {e}")
        return None


def _extract_port43(data):
    """Return the WHOIS host from an RDAP response object, or ``None``.

    RDAP defines ``port43`` as a plain string.  A dict carrying a ``server``
    key is still accepted because the previous implementation assumed that
    shape.
    """
    port43 = data.get('port43')
    if isinstance(port43, str):
        return port43
    if isinstance(port43, dict):
        return port43.get('server', '')
    return None


def get_rdap_info(domain):
    """Get RDAP information for a domain.

    Each endpoint in ``_RDAP_URL_TEMPLATES`` is tried in order.

    Args:
        domain: Domain name, with or without a leading dot.

    Returns:
        A dict with ``rdap_server`` (host part of the URL that answered),
        ``name_servers`` (list of ``ldhName`` values) and ``port43``, or
        ``None`` when no endpoint returned a usable response.
    """
    try:
        # Remove dot prefix if present
        clean_domain = domain.lstrip('.')

        for template in _RDAP_URL_TEMPLATES:
            rdap_url = template.format(domain=clean_domain)
            try:
                response = requests.get(rdap_url, timeout=5)
                if response.status_code != 200:
                    continue
                data = response.json()
            except (requests.RequestException, ValueError):
                # Network failure or a non-JSON body: try the next endpoint.
                continue

            if not isinstance(data, dict):
                continue

            nameservers = data.get('nameservers')
            if not isinstance(nameservers, list):
                nameservers = []

            return {
                'rdap_server': urlsplit(rdap_url).netloc,
                'name_servers': [
                    ns['ldhName']
                    for ns in nameservers
                    if isinstance(ns, dict) and ns.get('ldhName')
                ],
                'port43': _extract_port43(data),
            }

        return None

    except Exception as e:
        # Best-effort boundary: one failing domain must not abort the batch.
        print(f"Error fetching RDAP info for {domain}: {e}")
        return None


def get_dns_servers(domain):
    """Get DNS servers using DNS resolution.

    Args:
        domain: Domain name, with or without a leading dot.

    Returns:
        ``{'name_servers': [...]}`` built from the NS records, or ``None`` on
        any failure (including ``dns.resolver`` not being importable).
    """
    try:
        # Imported lazily so the rest of the script works without dnspython.
        import dns.resolver

        clean_domain = domain.lstrip('.')
        answers = dns.resolver.resolve(clean_domain, 'NS')
        return {'name_servers': [str(rdata) for rdata in answers]}

    except Exception:
        # Deliberately silent: DNS is only a fallback data source.
        return None


def _collect_domain_data(domain):
    """Query all sources for ``domain`` and merge them into UPDATE parameters.

    Returns a dict whose values are either real data or ``None``.  ``None``
    means "nothing found", which the UPDATE statement turns into "keep the
    current column value".
    """
    iana_info = get_iana_registry_info(domain) or {}
    rdap_info = get_rdap_info(domain) or {}
    dns_info = get_dns_servers(domain) or {}

    # Prefer RDAP name servers; fall back to DNS when RDAP has none.
    name_servers = rdap_info.get('name_servers') or dns_info.get('name_servers')

    return {
        'registration_url': iana_info.get('registration_url') or None,
        # RDAP's port43 names the WHOIS host, so use it when IANA has nothing.
        'whois_server': (iana_info.get('whois_server')
                         or rdap_info.get('port43')
                         or None),
        'rdap_server': rdap_info.get('rdap_server') or None,
        'name_servers': json.dumps(name_servers) if name_servers else None,
        'sponsoring': iana_info.get('sponsoring') or None,
        'administrative': iana_info.get('administrative') or None,
        'technical': iana_info.get('technical') or None,
    }


def _print_statistics(cursor):
    """Print how many non-removed rows have each enrichment column set."""
    cursor.execute("""
        SELECT
            COUNT(*) as total,
            COUNT(CASE WHEN registration_url IS NOT NULL THEN 1 END) as with_reg_url,
            COUNT(CASE WHEN whois_server IS NOT NULL THEN 1 END) as with_whois,
            COUNT(CASE WHEN rdap_server IS NOT NULL THEN 1 END) as with_rdap,
            COUNT(CASE WHEN name_servers IS NOT NULL THEN 1 END) as with_ns
        FROM domain_zone
        WHERE removed = FALSE
    """)
    stats = cursor.fetchone()

    print("\nDatabase Statistics:")
    print(f" Total domains: {stats[0]}")
    print(f" With registration URL: {stats[1]}")
    print(f" With WHOIS server: {stats[2]}")
    print(f" With RDAP server: {stats[3]}")
    print(f" With name servers: {stats[4]}")


def update_domain_data(batch_size=50):
    """Update domain_zone table with additional data.

    Selects up to ``batch_size`` non-removed rows where ``registration_url``,
    ``whois_server`` or ``rdap_server`` is NULL, looks each domain up and
    writes the results back, committing after every row.

    Columns for which a lookup found nothing keep their current value; they
    are not overwritten with NULL.

    NOTE(review): rows that can never be completed are selected again on every
    run because the selection is ordered by id only.

    Args:
        batch_size: Maximum number of rows processed per call.

    Returns:
        ``True`` on success (including "nothing to do"), ``False`` when a
        database error occurred; the error is printed.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Get domains that need data enrichment
        cursor.execute("""
            SELECT id, domain, root_utf
            FROM domain_zone
            WHERE removed = FALSE
              AND (registration_url IS NULL
                   OR whois_server IS NULL
                   OR rdap_server IS NULL)
            ORDER BY id
            LIMIT %s
        """, (int(batch_size),))
        domains_to_update = cursor.fetchall()

        if not domains_to_update:
            print("All domains already have complete data!")
            return True

        print(f"Updating data for {len(domains_to_update)} domains...")

        for domain_id, domain, _root_utf in domains_to_update:
            print(f"Processing {domain}...")

            update_data = _collect_domain_data(domain)

            # COALESCE keeps the stored value whenever the new one is NULL.
            cursor.execute("""
                UPDATE domain_zone SET
                    registration_url = COALESCE(%s, registration_url),
                    whois_server = COALESCE(%s, whois_server),
                    rdap_server = COALESCE(%s, rdap_server),
                    name_servers = COALESCE(%s, name_servers),
                    sponsoring = COALESCE(%s, sponsoring),
                    administrative = COALESCE(%s, administrative),
                    technical = COALESCE(%s, technical),
                    updated_at = CURRENT_TIMESTAMP
                WHERE id = %s
            """, (
                update_data['registration_url'],
                update_data['whois_server'],
                update_data['rdap_server'],
                update_data['name_servers'],
                update_data['sponsoring'],
                update_data['administrative'],
                update_data['technical'],
                domain_id,
            ))
            conn.commit()

            # Rate limiting
            time.sleep(REQUEST_DELAY_SECONDS)

        print(f"Updated {len(domains_to_update)} domains successfully")

        # Show statistics
        _print_statistics(cursor)
        return True

    except mysql.connector.Error as e:
        print(f"Database error: {e}")
        return False
    finally:
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()


def show_sample_data():
    """Show sample data from domain_zone table.

    Prints up to ten non-removed rows that have a registration URL or a WHOIS
    server.  Database errors are printed, not raised.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        print("\n=== Sample enriched data from domain_zone table ===")
        cursor.execute("""
            SELECT id, domain, root_utf, whois_server, rdap_server,
                   JSON_LENGTH(name_servers) as ns_count, sponsoring
            FROM domain_zone
            WHERE removed = FALSE
              AND (registration_url IS NOT NULL OR whois_server IS NOT NULL)
            ORDER BY id
            LIMIT 10
        """)

        for row in cursor.fetchall():
            domain_display = row[1]
            # Ignore an optional leading dot, as the lookup functions do,
            # when checking for a punycode label.
            is_punycode = (row[1] or '').lstrip('.').startswith('xn--')
            if is_punycode and row[2]:
                domain_display = f"{row[1]} ({row[2]})"

            print(f"{row[0]} {domain_display}")
            print(f" WHOIS: {row[3] or 'N/A'}")
            print(f" RDAP: {row[4] or 'N/A'}")
            print(f" Name Servers: {row[5] or 0}")
            print(f" Sponsor: {row[6] or 'N/A'}")
            print()

    except mysql.connector.Error as e:
        print(f"Database error: {e}")
    finally:
        # Release the handles on the error path as well.
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()


def main():
    """Run the enrichment pass and then print a sample of the results.

    The database password is taken from the first command-line argument when
    given, otherwise it is prompted for.  Exits with status 1 when the update
    step fails.
    """
    # Get password from command line argument or prompt.
    # NOTE(review): a password passed as an argument is visible in the process
    # list; the interactive prompt avoids that.
    if len(sys.argv) > 1:
        password = sys.argv[1]
    else:
        password = getpass.getpass("Enter MariaDB password for user 'root': ")

    DB_CONFIG['password'] = password

    print("Starting domain_zone data enrichment process...")

    # Update domain data
    if not update_domain_data():
        print("Failed to update domain data")
        sys.exit(1)

    # Show sample data
    show_sample_data()

    print("\n=== Domain data enrichment completed ===")


if __name__ == "__main__":
    main()