init
This commit is contained in:
206
populate_separate_tables.py
Normal file
206
populate_separate_tables.py
Normal file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to populate domain_root and domain_suffix tables separately
|
||||
"""
|
||||
|
||||
import mysql.connector
|
||||
import requests
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# Database connection configuration
|
||||
DB_CONFIG = {
|
||||
'host': 'l2',
|
||||
'port': 3306,
|
||||
'user': 'root',
|
||||
'password': None, # Will be set from command line or input
|
||||
'database': 'sp_spider',
|
||||
'charset': 'utf8mb4',
|
||||
'ssl_disabled': True,
|
||||
'auth_plugin': 'mysql_native_password'
|
||||
}
|
||||
|
||||
# URLs for data sources
|
||||
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'
|
||||
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
|
||||
|
||||
def fetch_tld_data():
|
||||
"""Fetch TLD data from IANA"""
|
||||
try:
|
||||
response = requests.get(IANA_TLD_URL)
|
||||
response.raise_for_status()
|
||||
|
||||
lines = response.text.strip().split('\n')
|
||||
tlds = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
tlds.append(line.lower())
|
||||
|
||||
return tlds
|
||||
except requests.RequestException as e:
|
||||
print(f"Error fetching TLD data: {e}")
|
||||
return None
|
||||
|
||||
def fetch_psl_data():
|
||||
"""Fetch Public Suffix List data"""
|
||||
try:
|
||||
response = requests.get(PSL_URL)
|
||||
response.raise_for_status()
|
||||
|
||||
lines = response.text.strip().split('\n')
|
||||
suffixes = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if line and not line.startswith('//'):
|
||||
suffixes.append(line.lower())
|
||||
|
||||
return suffixes
|
||||
except requests.RequestException as e:
|
||||
print(f"Error fetching PSL data: {e}")
|
||||
return None
|
||||
|
||||
def populate_domain_root(tlds):
|
||||
"""Populate domain_root table with TLD data"""
|
||||
try:
|
||||
conn = mysql.connector.connect(**DB_CONFIG)
|
||||
cursor = conn.cursor()
|
||||
|
||||
insert_query = "INSERT IGNORE INTO domain_root (root) VALUES (%s)"
|
||||
|
||||
batch_size = 100
|
||||
inserted_count = 0
|
||||
|
||||
for i in range(0, len(tlds), batch_size):
|
||||
batch = tlds[i:i + batch_size]
|
||||
data = [(tld,) for tld in batch]
|
||||
cursor.executemany(insert_query, data)
|
||||
inserted_count += cursor.rowcount
|
||||
conn.commit()
|
||||
print(f"domain_root batch {i//batch_size + 1}: {cursor.rowcount} TLDs")
|
||||
|
||||
print(f"Successfully inserted {inserted_count} TLDs into domain_root table")
|
||||
|
||||
# Get total count
|
||||
cursor.execute("SELECT COUNT(*) FROM domain_root")
|
||||
total_count = cursor.fetchone()[0]
|
||||
print(f"Total TLDs in domain_root table: {total_count}")
|
||||
|
||||
return True
|
||||
|
||||
except mysql.connector.Error as e:
|
||||
print(f"Database error in domain_root: {e}")
|
||||
return False
|
||||
finally:
|
||||
if 'conn' in locals() and conn.is_connected():
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def populate_domain_suffix(suffixes):
|
||||
"""Populate domain_suffix table with PSL data"""
|
||||
try:
|
||||
conn = mysql.connector.connect(**DB_CONFIG)
|
||||
cursor = conn.cursor()
|
||||
|
||||
insert_query = "INSERT IGNORE INTO domain_suffix (suffix) VALUES (%s)"
|
||||
|
||||
batch_size = 100
|
||||
inserted_count = 0
|
||||
|
||||
for i in range(0, len(suffixes), batch_size):
|
||||
batch = suffixes[i:i + batch_size]
|
||||
data = [(suffix,) for suffix in batch]
|
||||
cursor.executemany(insert_query, data)
|
||||
inserted_count += cursor.rowcount
|
||||
conn.commit()
|
||||
print(f"domain_suffix batch {i//batch_size + 1}: {cursor.rowcount} suffixes")
|
||||
|
||||
print(f"Successfully inserted {inserted_count} suffixes into domain_suffix table")
|
||||
|
||||
# Get total count
|
||||
cursor.execute("SELECT COUNT(*) FROM domain_suffix")
|
||||
total_count = cursor.fetchone()[0]
|
||||
print(f"Total suffixes in domain_suffix table: {total_count}")
|
||||
|
||||
return True
|
||||
|
||||
except mysql.connector.Error as e:
|
||||
print(f"Database error in domain_suffix: {e}")
|
||||
return False
|
||||
finally:
|
||||
if 'conn' in locals() and conn.is_connected():
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def show_sample_data():
|
||||
"""Show sample data from both tables"""
|
||||
try:
|
||||
conn = mysql.connector.connect(**DB_CONFIG)
|
||||
cursor = conn.cursor()
|
||||
|
||||
print("\n=== Sample data from domain_root table ===")
|
||||
cursor.execute("SELECT id, root, created_at FROM domain_root ORDER BY id LIMIT 10")
|
||||
for row in cursor.fetchall():
|
||||
print(f"{row[0]} {row[1]} {row[2]}")
|
||||
|
||||
print("\n=== Sample data from domain_suffix table ===")
|
||||
cursor.execute("SELECT id, suffix, created_at FROM domain_suffix ORDER BY id LIMIT 10")
|
||||
for row in cursor.fetchall():
|
||||
print(f"{row[0]} {row[1]} {row[2]}")
|
||||
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
except mysql.connector.Error as e:
|
||||
print(f"Database error: {e}")
|
||||
|
||||
def main():
|
||||
import getpass
|
||||
|
||||
# Get password from command line argument or prompt
|
||||
if len(sys.argv) > 1:
|
||||
password = sys.argv[1]
|
||||
else:
|
||||
password = getpass.getpass("Enter MariaDB password for user 'root': ")
|
||||
|
||||
DB_CONFIG['password'] = password
|
||||
|
||||
print("Starting separate tables population process...")
|
||||
|
||||
# Fetch TLD data
|
||||
print(f"\nFetching TLD data from: {IANA_TLD_URL}")
|
||||
tlds = fetch_tld_data()
|
||||
if not tlds:
|
||||
print("Failed to fetch TLD data")
|
||||
sys.exit(1)
|
||||
print(f"Fetched {len(tlds)} TLDs")
|
||||
|
||||
# Fetch PSL data
|
||||
print(f"\nFetching PSL data from: {PSL_URL}")
|
||||
suffixes = fetch_psl_data()
|
||||
if not suffixes:
|
||||
print("Failed to fetch PSL data")
|
||||
sys.exit(1)
|
||||
print(f"Fetched {len(suffixes)} suffixes")
|
||||
|
||||
# Populate domain_root table
|
||||
print(f"\nPopulating domain_root table...")
|
||||
if not populate_domain_root(tlds):
|
||||
print("Failed to populate domain_root table")
|
||||
sys.exit(1)
|
||||
|
||||
# Populate domain_suffix table
|
||||
print(f"\nPopulating domain_suffix table...")
|
||||
if not populate_domain_suffix(suffixes):
|
||||
print("Failed to populate domain_suffix table")
|
||||
sys.exit(1)
|
||||
|
||||
# Show sample data
|
||||
show_sample_data()
|
||||
|
||||
print("\n=== Tables population completed successfully ===")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user