#!/usr/bin/env python3 """ Script to populate domain_root and domain_suffix tables separately """ import mysql.connector import requests import sys from datetime import datetime # Database connection configuration DB_CONFIG = { 'host': 'l2', 'port': 3306, 'user': 'root', 'password': None, # Will be set from command line or input 'database': 'sp_spider', 'charset': 'utf8mb4', 'ssl_disabled': True, 'auth_plugin': 'mysql_native_password' } # URLs for data sources IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt' PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat' def fetch_tld_data(): """Fetch TLD data from IANA""" try: response = requests.get(IANA_TLD_URL) response.raise_for_status() lines = response.text.strip().split('\n') tlds = [] for line in lines: line = line.strip() if line and not line.startswith('#'): tlds.append(line.lower()) return tlds except requests.RequestException as e: print(f"Error fetching TLD data: {e}") return None def fetch_psl_data(): """Fetch Public Suffix List data""" try: response = requests.get(PSL_URL) response.raise_for_status() lines = response.text.strip().split('\n') suffixes = [] for line in lines: line = line.strip() if line and not line.startswith('//'): suffixes.append(line.lower()) return suffixes except requests.RequestException as e: print(f"Error fetching PSL data: {e}") return None def populate_domain_root(tlds): """Populate domain_root table with TLD data""" try: conn = mysql.connector.connect(**DB_CONFIG) cursor = conn.cursor() insert_query = "INSERT IGNORE INTO domain_root (root) VALUES (%s)" batch_size = 100 inserted_count = 0 for i in range(0, len(tlds), batch_size): batch = tlds[i:i + batch_size] data = [(tld,) for tld in batch] cursor.executemany(insert_query, data) inserted_count += cursor.rowcount conn.commit() print(f"domain_root batch {i//batch_size + 1}: {cursor.rowcount} TLDs") print(f"Successfully inserted {inserted_count} TLDs into domain_root table") # Get total count cursor.execute("SELECT COUNT(*) FROM domain_root") total_count = cursor.fetchone()[0] print(f"Total TLDs in domain_root table: {total_count}") return True except mysql.connector.Error as e: print(f"Database error in domain_root: {e}") return False finally: if 'conn' in locals() and conn.is_connected(): cursor.close() conn.close() def populate_domain_suffix(suffixes): """Populate domain_suffix table with PSL data""" try: conn = mysql.connector.connect(**DB_CONFIG) cursor = conn.cursor() insert_query = "INSERT IGNORE INTO domain_suffix (suffix) VALUES (%s)" batch_size = 100 inserted_count = 0 for i in range(0, len(suffixes), batch_size): batch = suffixes[i:i + batch_size] data = [(suffix,) for suffix in batch] cursor.executemany(insert_query, data) inserted_count += cursor.rowcount conn.commit() print(f"domain_suffix batch {i//batch_size + 1}: {cursor.rowcount} suffixes") print(f"Successfully inserted {inserted_count} suffixes into domain_suffix table") # Get total count cursor.execute("SELECT COUNT(*) FROM domain_suffix") total_count = cursor.fetchone()[0] print(f"Total suffixes in domain_suffix table: {total_count}") return True except mysql.connector.Error as e: print(f"Database error in domain_suffix: {e}") return False finally: if 'conn' in locals() and conn.is_connected(): cursor.close() conn.close() def show_sample_data(): """Show sample data from both tables""" try: conn = mysql.connector.connect(**DB_CONFIG) cursor = conn.cursor() print("\n=== Sample data from domain_root table ===") cursor.execute("SELECT id, root, created_at FROM domain_root ORDER BY id LIMIT 10") for row in cursor.fetchall(): print(f"{row[0]} {row[1]} {row[2]}") print("\n=== Sample data from domain_suffix table ===") cursor.execute("SELECT id, suffix, created_at FROM domain_suffix ORDER BY id LIMIT 10") for row in cursor.fetchall(): print(f"{row[0]} {row[1]} {row[2]}") cursor.close() conn.close() except mysql.connector.Error as e: print(f"Database error: {e}") def main(): import getpass # Get password from command line argument or prompt if len(sys.argv) > 1: password = sys.argv[1] else: password = getpass.getpass("Enter MariaDB password for user 'root': ") DB_CONFIG['password'] = password print("Starting separate tables population process...") # Fetch TLD data print(f"\nFetching TLD data from: {IANA_TLD_URL}") tlds = fetch_tld_data() if not tlds: print("Failed to fetch TLD data") sys.exit(1) print(f"Fetched {len(tlds)} TLDs") # Fetch PSL data print(f"\nFetching PSL data from: {PSL_URL}") suffixes = fetch_psl_data() if not suffixes: print("Failed to fetch PSL data") sys.exit(1) print(f"Fetched {len(suffixes)} suffixes") # Populate domain_root table print(f"\nPopulating domain_root table...") if not populate_domain_root(tlds): print("Failed to populate domain_root table") sys.exit(1) # Populate domain_suffix table print(f"\nPopulating domain_suffix table...") if not populate_domain_suffix(suffixes): print("Failed to populate domain_suffix table") sys.exit(1) # Show sample data show_sample_data() print("\n=== Tables population completed successfully ===") if __name__ == "__main__": main()