#!/usr/bin/env python3
"""
Script to update domain_root, domain_suffix and domain_zone tables with
soft delete functionality.

Data sources:
  * IANA TLD list       -> domain_root, domain_zone
  * Public Suffix List  -> domain_suffix

"Soft delete" means rows that disappear from the upstream source are
flagged removed = TRUE instead of being deleted, and are restored
(removed = FALSE) if they later reappear in the source.
"""
import mysql.connector
import requests
import sys
from datetime import datetime
from bs4 import BeautifulSoup
import re
import idna

# Database connection configuration
DB_CONFIG = {
    'host': 'l2',
    'port': 3306,
    'user': 'root',
    'password': None,  # Will be set from command line or input
    'database': 'sp_spider',
    'charset': 'utf8mb4',
    'ssl_disabled': True,
    'auth_plugin': 'mysql_native_password'
}

# URLs for data sources
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
IANA_ROOT_ZONE_URL = 'https://www.iana.org/domains/root/db'

# Timeout (seconds) for every HTTP request so the script cannot hang
# forever on an unresponsive server.
HTTP_TIMEOUT = 30


def fetch_tld_data():
    """Fetch the authoritative TLD list from IANA.

    Returns:
        list[str] of lowercase TLDs (ASCII/Punycode form, no leading dot),
        or None on network error.
    """
    try:
        response = requests.get(IANA_TLD_URL, timeout=HTTP_TIMEOUT)
        response.raise_for_status()
        lines = response.text.strip().split('\n')
        tlds = []
        for line in lines:
            line = line.strip()
            # Skip blank lines and the "# Version ..." header comment
            if line and not line.startswith('#'):
                tlds.append(line.lower())
        return tlds
    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None


def fetch_psl_data():
    """Fetch the Public Suffix List.

    Returns:
        list[str] of lowercase suffix rules (including wildcard '*.' and
        exception '!' rules, which pass the '//' comment filter),
        or None on network error.
    """
    try:
        response = requests.get(PSL_URL, timeout=HTTP_TIMEOUT)
        response.raise_for_status()
        lines = response.text.strip().split('\n')
        suffixes = []
        for line in lines:
            line = line.strip()
            # Skip blank lines and '//' comments
            if line and not line.startswith('//'):
                suffixes.append(line.lower())
        return suffixes
    except requests.RequestException as e:
        print(f"Error fetching PSL data: {e}")
        return None


def fetch_domain_zone_data():
    """Fetch domain zone data using the IANA TLD list as authoritative source.

    Returns:
        list[dict] with a 'domain' key per TLD (no dot prefix, like
        domain_root), or None on error.
    """
    try:
        # Get the authoritative TLD list
        tld_response = requests.get(IANA_TLD_URL, timeout=HTTP_TIMEOUT)
        tld_response.raise_for_status()
        lines = tld_response.text.strip().split('\n')
        all_tlds = []
        for line in lines:
            line = line.strip()
            if line and not line.startswith('#'):
                all_tlds.append(line.lower())
        print(f"Authoritative TLD list contains: {len(all_tlds)} TLDs")

        # Create simple domain list following TLD order
        domains = [{'domain': tld} for tld in all_tlds]
        print(f"Created domain list: {len(domains)} domains")
        print(f"First 5 domains: {[d['domain'] for d in domains[:5]]}")
        return domains
    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None
    except Exception as e:
        print(f"Error processing domain data: {e}")
        return None


def fetch_domain_detail(detail_url):
    """Scrape type/manager details for one domain from an IANA detail page.

    Args:
        detail_url: URL of the per-domain page; the domain name is taken
            from the last path segment.

    Returns:
        dict with 'domain', 'type', 'tld_manager' keys, or None on error.
        'type' / 'tld_manager' fall back to defaults when not found.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(detail_url, headers=headers, timeout=HTTP_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract domain name from URL
        domain = detail_url.split('/')[-1].lower()

        domain_type = "generic"   # default
        tld_manager = "Unknown"   # default

        # Try to find type information.
        # NOTE: 'string=' replaces the deprecated 'text=' keyword.
        type_elements = soup.find_all(string=re.compile(r'Type|Type of domain', re.IGNORECASE))
        for element in type_elements:
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    domain_type = next_sibling.get_text(strip=True)
                    break

        # Try to find manager information
        manager_elements = soup.find_all(string=re.compile(r'Manager|Sponsor|Registry', re.IGNORECASE))
        for element in manager_elements:
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    tld_manager = next_sibling.get_text(strip=True)
                    break

        return {
            'domain': domain,
            'type': domain_type,
            'tld_manager': tld_manager
        }
    except Exception as e:
        print(f"Error fetching detail for {detail_url}: {e}")
        return None


def enhance_domains_with_idn(domains):
    """Add a 'unicode_domain' key to every domain dict.

    Punycode labels ('xn--' prefix) are decoded to their Unicode form;
    anything else (and anything that fails to decode) keeps the original
    ASCII name.
    """
    enhanced_domains = []
    for domain_data in domains:
        domain = domain_data['domain']
        enhanced_data = domain_data.copy()
        if domain.startswith('xn--'):
            try:
                enhanced_data['unicode_domain'] = idna.decode(domain)
            except (idna.IDNAError, UnicodeError):
                # Undecodable Punycode: keep the ASCII form rather than crash
                enhanced_data['unicode_domain'] = domain
        else:
            enhanced_data['unicode_domain'] = domain
        enhanced_domains.append(enhanced_data)
    return enhanced_domains


def update_domain_root(tlds):
    """Synchronize the domain_root table with the IANA TLD list.

    Steps: flag vanished TLDs removed, restore reappearing ones, insert
    brand-new ones, refresh updated_at on verified-active rows.

    Returns:
        True on success, False on database error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Current *active* entries in the database
        cursor.execute("SELECT id, root FROM domain_root WHERE removed = FALSE")
        current_entries = {row[1]: row[0] for row in cursor.fetchall()}

        tld_set = set(tlds)
        current_tlds = set(current_entries.keys())

        # Mark entries as removed if they no longer exist in the source
        removed_tlds = current_tlds - tld_set
        if removed_tlds:
            print(f"Marking {len(removed_tlds)} TLDs as removed")
            for tld in removed_tlds:
                cursor.execute(
                    "UPDATE domain_root SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE root = %s",
                    (tld,)
                )

        # Restore previously removed entries that reappeared in the source.
        # BUGFIX: the old code intersected the source with the *active* set
        # and queried 'removed = TRUE AND root IN (<active roots>)', which
        # can never match (and produced 'IN ()' SQL when empty). Scan the
        # removed rows directly instead.
        cursor.execute("SELECT root FROM domain_root WHERE removed = TRUE")
        to_restore = [row[0] for row in cursor.fetchall() if row[0] in tld_set]
        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed TLDs")
            for tld in to_restore:
                cursor.execute(
                    "UPDATE domain_root SET removed = FALSE, updated_at = CURRENT_TIMESTAMP WHERE root = %s",
                    (tld,)
                )

        # Add brand-new entries (restored ones are excluded; INSERT IGNORE
        # guards against races on the unique key anyway)
        new_tlds = tld_set - current_tlds - set(to_restore)
        if new_tlds:
            print(f"Adding {len(new_tlds)} new TLDs")
            insert_query = "INSERT IGNORE INTO domain_root (root, removed) VALUES (%s, FALSE)"
            batch_size = 100
            new_tld_list = list(new_tlds)  # listify once, not per batch
            for i in range(0, len(new_tld_list), batch_size):
                batch = new_tld_list[i:i + batch_size]
                cursor.executemany(insert_query, [(tld,) for tld in batch])
                conn.commit()
                print(f"domain_root batch {i//batch_size + 1}: {cursor.rowcount} new TLDs")

        # Refresh updated_at for active entries that still exist in source
        verified_active = current_tlds & tld_set
        if verified_active:
            print(f"Updating timestamps for {len(verified_active)} verified active TLDs")
            placeholders = ','.join(['%s'] * len(verified_active))
            cursor.execute(
                "UPDATE domain_root SET updated_at = CURRENT_TIMESTAMP WHERE removed = FALSE AND root IN (%s)" % placeholders,
                list(verified_active)
            )
        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_root WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_root WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]
        print(f"domain_root update completed:")
        print(f" Active entries: {active_count}")
        print(f" Removed entries: {removed_count}")
        print(f" New entries added: {len(new_tlds)}")
        print(f" Entries marked as removed: {len(removed_tlds)}")
        print(f" Entries restored: {len(to_restore)}")
        return True
    except mysql.connector.Error as e:
        print(f"Database error in domain_root: {e}")
        return False
    finally:
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()


def update_domain_suffix(suffixes):
    """Synchronize the domain_suffix table with the Public Suffix List.

    Same soft-delete strategy as update_domain_root().

    Returns:
        True on success, False on database error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Current *active* entries in the database
        cursor.execute("SELECT id, suffix FROM domain_suffix WHERE removed = FALSE")
        current_entries = {row[1]: row[0] for row in cursor.fetchall()}

        suffix_set = set(suffixes)
        current_suffixes = set(current_entries.keys())

        # Mark entries as removed if they no longer exist in the source
        removed_suffixes = current_suffixes - suffix_set
        if removed_suffixes:
            print(f"Marking {len(removed_suffixes)} suffixes as removed")
            for suffix in removed_suffixes:
                cursor.execute(
                    "UPDATE domain_suffix SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE suffix = %s",
                    (suffix,)
                )

        # Restore previously removed entries that reappeared in the source
        # (see the bugfix note in update_domain_root)
        cursor.execute("SELECT suffix FROM domain_suffix WHERE removed = TRUE")
        to_restore = [row[0] for row in cursor.fetchall() if row[0] in suffix_set]
        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed suffixes")
            for suffix in to_restore:
                cursor.execute(
                    "UPDATE domain_suffix SET removed = FALSE, updated_at = CURRENT_TIMESTAMP WHERE suffix = %s",
                    (suffix,)
                )

        # Add brand-new entries
        new_suffixes = suffix_set - current_suffixes - set(to_restore)
        if new_suffixes:
            print(f"Adding {len(new_suffixes)} new suffixes")
            insert_query = "INSERT IGNORE INTO domain_suffix (suffix, removed) VALUES (%s, FALSE)"
            batch_size = 100
            new_suffix_list = list(new_suffixes)  # listify once, not per batch
            for i in range(0, len(new_suffix_list), batch_size):
                batch = new_suffix_list[i:i + batch_size]
                cursor.executemany(insert_query, [(suffix,) for suffix in batch])
                conn.commit()
                print(f"domain_suffix batch {i//batch_size + 1}: {cursor.rowcount} new suffixes")

        # Refresh updated_at for active entries that still exist in source
        verified_active = current_suffixes & suffix_set
        if verified_active:
            print(f"Updating timestamps for {len(verified_active)} verified active suffixes")
            placeholders = ','.join(['%s'] * len(verified_active))
            cursor.execute(
                "UPDATE domain_suffix SET updated_at = CURRENT_TIMESTAMP WHERE removed = FALSE AND suffix IN (%s)" % placeholders,
                list(verified_active)
            )
        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_suffix WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_suffix WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]
        print(f"domain_suffix update completed:")
        print(f" Active entries: {active_count}")
        print(f" Removed entries: {removed_count}")
        print(f" New entries added: {len(new_suffixes)}")
        print(f" Entries marked as removed: {len(removed_suffixes)}")
        print(f" Entries restored: {len(to_restore)}")
        return True
    except mysql.connector.Error as e:
        print(f"Database error in domain_suffix: {e}")
        return False
    finally:
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()


def update_domain_zone(domains):
    """Synchronize the domain_zone table (domain + Unicode form + IDN flag).

    Same soft-delete strategy as update_domain_root(), plus root_utf is
    kept in sync for existing rows.

    Args:
        domains: list of dicts with 'domain' and (optionally)
            'unicode_domain' keys, as produced by enhance_domains_with_idn().

    Returns:
        True on success, False on database error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Current *active* entries in the database
        cursor.execute("SELECT id, domain, root_utf, punycode FROM domain_zone WHERE removed = FALSE")
        current_entries = {
            row[1]: {'id': row[0], 'root_utf': row[2], 'punycode': row[3]}
            for row in cursor.fetchall()
        }

        domain_set = {d['domain'] for d in domains}
        current_domains = set(current_entries.keys())

        # Mapping for new data.
        # BUGFIX: the fallback default was the whole dict ('domain'), which
        # would store a dict repr in root_utf; fall back to the name itself.
        domain_data = {
            d['domain']: {'unicode_domain': d.get('unicode_domain', d['domain'])}
            for d in domains
        }

        # Mark entries as removed if they no longer exist in the source
        removed_domains = current_domains - domain_set
        if removed_domains:
            print(f"Marking {len(removed_domains)} domains as removed")
            for domain in removed_domains:
                cursor.execute(
                    "UPDATE domain_zone SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (domain,)
                )

        # Restore previously removed entries that reappeared in the source,
        # refreshing root_utf/punycode at the same time
        # (see the bugfix note in update_domain_root)
        cursor.execute("SELECT domain FROM domain_zone WHERE removed = TRUE")
        to_restore = [row[0] for row in cursor.fetchall() if row[0] in domain_set]
        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed domains")
            for domain in to_restore:
                new_unicode = domain_data[domain]['unicode_domain']
                is_punycode = domain.startswith('xn--')
                cursor.execute(
                    "UPDATE domain_zone SET removed = FALSE, root_utf = %s, punycode = %s, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (new_unicode, is_punycode, domain)
                )

        # Add brand-new entries
        new_domains = domain_set - current_domains - set(to_restore)
        if new_domains:
            print(f"Adding {len(new_domains)} new domains")
            insert_query = "INSERT IGNORE INTO domain_zone (domain, root_utf, punycode, removed) VALUES (%s, %s, %s, FALSE)"
            batch_size = 100
            new_domain_list = list(new_domains)
            for i in range(0, len(new_domain_list), batch_size):
                batch = new_domain_list[i:i + batch_size]
                data = []
                for domain in batch:
                    unicode_domain = domain_data[domain]['unicode_domain']
                    is_punycode = domain.startswith('xn--')
                    data.append((domain, unicode_domain, is_punycode))
                cursor.executemany(insert_query, data)
                conn.commit()
                print(f"domain_zone batch {i//batch_size + 1}: {cursor.rowcount} new domains")

        # Update existing active entries whose Unicode form changed
        common_domains = current_domains & domain_set
        for domain in common_domains:
            new_unicode = domain_data[domain]['unicode_domain']
            if current_entries[domain]['root_utf'] != new_unicode:
                cursor.execute(
                    "UPDATE domain_zone SET root_utf = %s, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (new_unicode, domain)
                )

        # Refresh updated_at for active entries that still exist in source
        if common_domains:
            print(f"Updating timestamps for {len(common_domains)} verified active domains")
            placeholders = ','.join(['%s'] * len(common_domains))
            cursor.execute(
                "UPDATE domain_zone SET updated_at = CURRENT_TIMESTAMP WHERE removed = FALSE AND domain IN (%s)" % placeholders,
                list(common_domains)
            )
        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_zone WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_zone WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]
        print(f"domain_zone update completed:")
        print(f" Active entries: {active_count}")
        print(f" Removed entries: {removed_count}")
        print(f" New entries added: {len(new_domains)}")
        print(f" Entries marked as removed: {len(removed_domains)}")
        print(f" Entries restored: {len(to_restore)}")
        return True
    except mysql.connector.Error as e:
        print(f"Database error in domain_zone: {e}")
        return False
    finally:
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()


def show_sample_data():
    """Print a few rows from each table for a quick visual sanity check."""
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        print("\n=== Sample data from domain_root table ===")
        cursor.execute("SELECT id, root, removed, created_at FROM domain_root ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            status = "REMOVED" if row[2] else "ACTIVE"
            print(f"{row[0]} {row[1]} [{status}] {row[3]}")

        print("\n=== Sample data from domain_suffix table ===")
        cursor.execute("SELECT id, suffix, removed, created_at FROM domain_suffix ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            status = "REMOVED" if row[2] else "ACTIVE"
            print(f"{row[0]} {row[1]} [{status}] {row[3]}")

        print("\n=== Sample data from domain_zone table ===")
        cursor.execute("SELECT id, domain, root_utf, punycode, removed, created_at FROM domain_zone ORDER BY id LIMIT 15")
        for row in cursor.fetchall():
            status = "REMOVED" if row[4] else "ACTIVE"
            domain_display = row[1]
            root_utf_display = row[2] if row[2] else ""
            punycode_flag = "IDN" if row[3] else "REG"
            # Show Unicode representation for IDN domains
            if row[1].startswith('xn--') and row[2]:
                domain_display = f"{row[1]} ({row[2]})"
            elif row[1].startswith('xn--'):
                try:
                    unicode_domain = idna.decode(row[1])
                    domain_display = f"{row[1]} ({unicode_domain})"
                except (idna.IDNAError, UnicodeError):
                    # Undecodable Punycode: fall back to the raw name
                    domain_display = row[1]
            # Format the display
            if root_utf_display and root_utf_display != row[1]:
                print(f"{row[0]} {domain_display} [{punycode_flag}] UTF:{root_utf_display} [{status}] {row[5]}")
            else:
                print(f"{row[0]} {domain_display} [{punycode_flag}] [{status}] {row[5]}")

        cursor.close()
        conn.close()
    except mysql.connector.Error as e:
        print(f"Database error: {e}")


def main():
    """Entry point: fetch all sources, update all three tables, show samples.

    Usage: script.py [password] — the MariaDB root password is taken from
    argv[1] or prompted for interactively.
    """
    import getpass
    if len(sys.argv) > 1:
        password = sys.argv[1]
    else:
        password = getpass.getpass("Enter MariaDB password for user 'root': ")
    DB_CONFIG['password'] = password

    print("Starting domain tables update process with soft delete functionality...")

    # Fetch TLD data
    print(f"\nFetching TLD data from: {IANA_TLD_URL}")
    tlds = fetch_tld_data()
    if not tlds:
        print("Failed to fetch TLD data")
        sys.exit(1)
    print(f"Fetched {len(tlds)} TLDs")

    # Fetch PSL data
    print(f"\nFetching PSL data from: {PSL_URL}")
    suffixes = fetch_psl_data()
    if not suffixes:
        print("Failed to fetch PSL data")
        sys.exit(1)
    print(f"Fetched {len(suffixes)} suffixes")

    # Fetch domain zone data
    print(f"\nFetching domain zone data from TLD list and web scraping...")
    domains = fetch_domain_zone_data()
    if not domains:
        print("Failed to fetch domain zone data")
        sys.exit(1)
    print(f"Fetched {len(domains)} domain zones")

    # Enhance domains with IDN Unicode representations
    domains = enhance_domains_with_idn(domains)

    # Update domain_root table
    print(f"\nUpdating domain_root table...")
    if not update_domain_root(tlds):
        print("Failed to update domain_root table")
        sys.exit(1)

    # Update domain_suffix table
    print(f"\nUpdating domain_suffix table...")
    if not update_domain_suffix(suffixes):
        print("Failed to update domain_suffix table")
        sys.exit(1)

    # Update domain_zone table
    print(f"\nUpdating domain_zone table...")
    if not update_domain_zone(domains):
        print("Failed to update domain_zone table")
        sys.exit(1)

    # Show sample data
    show_sample_data()
    print("\n=== Tables update completed successfully ===")


if __name__ == "__main__":
    main()