Files
get-domain-suffix-iana/populate_separate_tables.py
2026-03-11 21:18:24 +05:30

207 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Script to populate domain_root and domain_suffix tables separately
"""
import mysql.connector
import requests
import sys
from datetime import datetime
# Database connection configuration
DB_CONFIG = {
'host': 'l2',
'port': 3306,
'user': 'root',
'password': None, # Will be set from command line or input
'database': 'sp_spider',
'charset': 'utf8mb4',
'ssl_disabled': True,
'auth_plugin': 'mysql_native_password'
}
# URLs for data sources
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
def fetch_tld_data():
"""Fetch TLD data from IANA"""
try:
response = requests.get(IANA_TLD_URL)
response.raise_for_status()
lines = response.text.strip().split('\n')
tlds = []
for line in lines:
line = line.strip()
if line and not line.startswith('#'):
tlds.append(line.lower())
return tlds
except requests.RequestException as e:
print(f"Error fetching TLD data: {e}")
return None
def fetch_psl_data():
"""Fetch Public Suffix List data"""
try:
response = requests.get(PSL_URL)
response.raise_for_status()
lines = response.text.strip().split('\n')
suffixes = []
for line in lines:
line = line.strip()
if line and not line.startswith('//'):
suffixes.append(line.lower())
return suffixes
except requests.RequestException as e:
print(f"Error fetching PSL data: {e}")
return None
def populate_domain_root(tlds):
"""Populate domain_root table with TLD data"""
try:
conn = mysql.connector.connect(**DB_CONFIG)
cursor = conn.cursor()
insert_query = "INSERT IGNORE INTO domain_root (root) VALUES (%s)"
batch_size = 100
inserted_count = 0
for i in range(0, len(tlds), batch_size):
batch = tlds[i:i + batch_size]
data = [(tld,) for tld in batch]
cursor.executemany(insert_query, data)
inserted_count += cursor.rowcount
conn.commit()
print(f"domain_root batch {i//batch_size + 1}: {cursor.rowcount} TLDs")
print(f"Successfully inserted {inserted_count} TLDs into domain_root table")
# Get total count
cursor.execute("SELECT COUNT(*) FROM domain_root")
total_count = cursor.fetchone()[0]
print(f"Total TLDs in domain_root table: {total_count}")
return True
except mysql.connector.Error as e:
print(f"Database error in domain_root: {e}")
return False
finally:
if 'conn' in locals() and conn.is_connected():
cursor.close()
conn.close()
def populate_domain_suffix(suffixes):
"""Populate domain_suffix table with PSL data"""
try:
conn = mysql.connector.connect(**DB_CONFIG)
cursor = conn.cursor()
insert_query = "INSERT IGNORE INTO domain_suffix (suffix) VALUES (%s)"
batch_size = 100
inserted_count = 0
for i in range(0, len(suffixes), batch_size):
batch = suffixes[i:i + batch_size]
data = [(suffix,) for suffix in batch]
cursor.executemany(insert_query, data)
inserted_count += cursor.rowcount
conn.commit()
print(f"domain_suffix batch {i//batch_size + 1}: {cursor.rowcount} suffixes")
print(f"Successfully inserted {inserted_count} suffixes into domain_suffix table")
# Get total count
cursor.execute("SELECT COUNT(*) FROM domain_suffix")
total_count = cursor.fetchone()[0]
print(f"Total suffixes in domain_suffix table: {total_count}")
return True
except mysql.connector.Error as e:
print(f"Database error in domain_suffix: {e}")
return False
finally:
if 'conn' in locals() and conn.is_connected():
cursor.close()
conn.close()
def show_sample_data():
"""Show sample data from both tables"""
try:
conn = mysql.connector.connect(**DB_CONFIG)
cursor = conn.cursor()
print("\n=== Sample data from domain_root table ===")
cursor.execute("SELECT id, root, created_at FROM domain_root ORDER BY id LIMIT 10")
for row in cursor.fetchall():
print(f"{row[0]} {row[1]} {row[2]}")
print("\n=== Sample data from domain_suffix table ===")
cursor.execute("SELECT id, suffix, created_at FROM domain_suffix ORDER BY id LIMIT 10")
for row in cursor.fetchall():
print(f"{row[0]} {row[1]} {row[2]}")
cursor.close()
conn.close()
except mysql.connector.Error as e:
print(f"Database error: {e}")
def main():
import getpass
# Get password from command line argument or prompt
if len(sys.argv) > 1:
password = sys.argv[1]
else:
password = getpass.getpass("Enter MariaDB password for user 'root': ")
DB_CONFIG['password'] = password
print("Starting separate tables population process...")
# Fetch TLD data
print(f"\nFetching TLD data from: {IANA_TLD_URL}")
tlds = fetch_tld_data()
if not tlds:
print("Failed to fetch TLD data")
sys.exit(1)
print(f"Fetched {len(tlds)} TLDs")
# Fetch PSL data
print(f"\nFetching PSL data from: {PSL_URL}")
suffixes = fetch_psl_data()
if not suffixes:
print("Failed to fetch PSL data")
sys.exit(1)
print(f"Fetched {len(suffixes)} suffixes")
# Populate domain_root table
print(f"\nPopulating domain_root table...")
if not populate_domain_root(tlds):
print("Failed to populate domain_root table")
sys.exit(1)
# Populate domain_suffix table
print(f"\nPopulating domain_suffix table...")
if not populate_domain_suffix(suffixes):
print("Failed to populate domain_suffix table")
sys.exit(1)
# Show sample data
show_sample_data()
print("\n=== Tables population completed successfully ===")
if __name__ == "__main__":
main()