207 lines
6.3 KiB
Python
207 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
Script to populate the domain_root and domain_suffix tables separately.
"""
|
|
|
|
import mysql.connector
|
|
import requests
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
# Database connection configuration.
# The whole dict is unpacked into mysql.connector.connect(**DB_CONFIG) by the
# functions in this file, so every key must be a valid connect() argument.
DB_CONFIG = {
    'host': 'l2',
    'port': 3306,
    'user': 'root',
    'password': None,  # Filled in by main() from the command line or a prompt
    'database': 'sp_spider',
    'charset': 'utf8mb4',
    'ssl_disabled': True,
    'auth_plugin': 'mysql_native_password',
}
|
|
|
|
# URLs for data sources
# IANA list of top-level domains (fetched by fetch_tld_data).
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'
# Public Suffix List (fetched by fetch_psl_data).
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
|
|
|
|
def fetch_tld_data(timeout=30):
    """Fetch TLD data from IANA.

    Downloads ``IANA_TLD_URL``, discards blank lines and lines starting with
    ``#``, and lower-cases every remaining line.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection; a timeout surfaces as a ``RequestException`` and is
            handled like any other fetch failure.

    Returns:
        A list of lower-cased TLD strings, or ``None`` if the request failed.
    """
    try:
        response = requests.get(IANA_TLD_URL, timeout=timeout)
        response.raise_for_status()
        text = response.text
    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None

    stripped_lines = (raw.strip() for raw in text.strip().split('\n'))
    return [
        line.lower()
        for line in stripped_lines
        if line and not line.startswith('#')
    ]
|
|
|
|
def fetch_psl_data(timeout=30):
    """Fetch Public Suffix List data.

    Downloads ``PSL_URL``, discards blank lines and lines starting with
    ``//``, and lower-cases every remaining line. Entries are kept exactly as
    they appear in the list otherwise (no further rule parsing is done here).

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection; a timeout surfaces as a ``RequestException`` and is
            handled like any other fetch failure.

    Returns:
        A list of lower-cased suffix strings, or ``None`` if the request
        failed.
    """
    try:
        response = requests.get(PSL_URL, timeout=timeout)
        response.raise_for_status()
        # The list is published as UTF-8 and contains non-ASCII suffixes.
        # If the server omits a charset, requests falls back to ISO-8859-1
        # for text/* responses, which would garble those entries.
        response.encoding = 'utf-8'
        text = response.text
    except requests.RequestException as e:
        print(f"Error fetching PSL data: {e}")
        return None

    stripped_lines = (raw.strip() for raw in text.strip().split('\n'))
    return [
        line.lower()
        for line in stripped_lines
        if line and not line.startswith('//')
    ]
|
|
|
|
def populate_domain_root(tlds):
    """Populate domain_root table with TLD data.

    Opens a connection using ``DB_CONFIG`` and inserts the given TLDs with
    ``INSERT IGNORE`` in batches of 100, committing after every batch. After
    the inserts, prints the total row count of ``domain_root``.

    Args:
        tlds: Sequence of TLD strings to insert into the ``root`` column.

    Returns:
        ``True`` on success, ``False`` if a ``mysql.connector.Error`` was
        raised (the error is printed, not re-raised).
    """
    insert_query = "INSERT IGNORE INTO domain_root (root) VALUES (%s)"
    batch_size = 100
    inserted_count = 0

    # Bound up front so the cleanup in ``finally`` never touches an unbound
    # name when connect() or cursor() itself fails.
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        batch_starts = range(0, len(tlds), batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            data = [(tld,) for tld in tlds[start:start + batch_size]]
            cursor.executemany(insert_query, data)
            inserted_count += cursor.rowcount
            # Commit per batch so earlier batches persist if a later one fails
            conn.commit()
            print(f"domain_root batch {batch_number}: {cursor.rowcount} TLDs")

        print(f"Successfully inserted {inserted_count} TLDs into domain_root table")

        # Get total count
        cursor.execute("SELECT COUNT(*) FROM domain_root")
        total_count = cursor.fetchone()[0]
        print(f"Total TLDs in domain_root table: {total_count}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_root: {e}")
        return False
    finally:
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()
|
|
|
|
def populate_domain_suffix(suffixes):
    """Populate domain_suffix table with PSL data.

    Opens a connection using ``DB_CONFIG`` and inserts the given suffixes
    with ``INSERT IGNORE`` in batches of 100, committing after every batch.
    After the inserts, prints the total row count of ``domain_suffix``.

    Args:
        suffixes: Sequence of suffix strings to insert into the ``suffix``
            column.

    Returns:
        ``True`` on success, ``False`` if a ``mysql.connector.Error`` was
        raised (the error is printed, not re-raised).
    """
    insert_query = "INSERT IGNORE INTO domain_suffix (suffix) VALUES (%s)"
    batch_size = 100
    inserted_count = 0

    # Bound up front so the cleanup in ``finally`` never touches an unbound
    # name when connect() or cursor() itself fails.
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        batch_starts = range(0, len(suffixes), batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            data = [(suffix,) for suffix in suffixes[start:start + batch_size]]
            cursor.executemany(insert_query, data)
            inserted_count += cursor.rowcount
            # Commit per batch so earlier batches persist if a later one fails
            conn.commit()
            print(f"domain_suffix batch {batch_number}: {cursor.rowcount} suffixes")

        print(f"Successfully inserted {inserted_count} suffixes into domain_suffix table")

        # Get total count
        cursor.execute("SELECT COUNT(*) FROM domain_suffix")
        total_count = cursor.fetchone()[0]
        print(f"Total suffixes in domain_suffix table: {total_count}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_suffix: {e}")
        return False
    finally:
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()
|
|
|
|
def show_sample_data():
    """Show sample data from both tables.

    Connects using ``DB_CONFIG`` and prints the first ten rows (ordered by
    ``id``) of ``domain_root`` and then of ``domain_suffix``. A
    ``mysql.connector.Error`` is printed and swallowed; nothing is returned.
    """
    # Bound up front so the cleanup in ``finally`` is safe even when
    # connect() or cursor() fails.
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        print("\n=== Sample data from domain_root table ===")
        cursor.execute("SELECT id, root, created_at FROM domain_root ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            print(f"{row[0]} {row[1]} {row[2]}")

        print("\n=== Sample data from domain_suffix table ===")
        cursor.execute("SELECT id, suffix, created_at FROM domain_suffix ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            print(f"{row[0]} {row[1]} {row[2]}")

    except mysql.connector.Error as e:
        print(f"Database error: {e}")
    finally:
        # Release the cursor and connection on the error path too; previously
        # they were only closed when both queries succeeded.
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()
|
|
|
|
def main():
    """Run the whole population workflow.

    Steps, in order: obtain the database password, fetch the IANA TLD list,
    fetch the Public Suffix List, populate ``domain_root``, populate
    ``domain_suffix``, and print sample rows. Exits with status 1 as soon as
    a fetch returns nothing or a populate step reports failure.
    """
    import getpass

    # Get password from command line argument or prompt.
    # NOTE(review): a password given as argv[1] is typically visible in the
    # process list and shell history; the interactive prompt avoids that.
    if len(sys.argv) > 1:
        password = sys.argv[1]
    else:
        password = getpass.getpass("Enter MariaDB password for user 'root': ")

    DB_CONFIG['password'] = password

    print("Starting separate tables population process...")

    # Fetch TLD data
    print(f"\nFetching TLD data from: {IANA_TLD_URL}")
    tlds = fetch_tld_data()
    if not tlds:
        # Covers both a failed request (None) and an empty list
        print("Failed to fetch TLD data")
        sys.exit(1)
    print(f"Fetched {len(tlds)} TLDs")

    # Fetch PSL data
    print(f"\nFetching PSL data from: {PSL_URL}")
    suffixes = fetch_psl_data()
    if not suffixes:
        # Covers both a failed request (None) and an empty list
        print("Failed to fetch PSL data")
        sys.exit(1)
    print(f"Fetched {len(suffixes)} suffixes")

    # Populate domain_root table
    print("\nPopulating domain_root table...")
    if not populate_domain_root(tlds):
        print("Failed to populate domain_root table")
        sys.exit(1)

    # Populate domain_suffix table
    print("\nPopulating domain_suffix table...")
    if not populate_domain_suffix(suffixes):
        print("Failed to populate domain_suffix table")
        sys.exit(1)

    # Show sample data
    show_sample_data()

    print("\n=== Tables population completed successfully ===")
|
|
|
|
# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|