570 lines
22 KiB
Python
570 lines
22 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to update domain_root and domain_suffix tables with soft delete functionality
|
|
"""
|
|
|
|
import mysql.connector
|
|
import requests
|
|
import sys
|
|
from datetime import datetime
|
|
from bs4 import BeautifulSoup
|
|
import re
|
|
import idna
|
|
|
|
# Database connection configuration (MariaDB/MySQL via mysql-connector).
DB_CONFIG = {
    'host': 'l2',
    'port': 3306,
    'user': 'root',
    'password': None,  # Will be set from command line or input
    'database': 'sp_spider',
    'charset': 'utf8mb4',  # full Unicode, needed for IDN domain text
    'ssl_disabled': True,
    'auth_plugin': 'mysql_native_password'
}

# URLs for data sources
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'  # authoritative TLD list
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'  # Mozilla Public Suffix List
# NOTE(review): IANA_ROOT_ZONE_URL is not referenced by any function visible in
# this file — possibly used elsewhere or left over; confirm before removing.
IANA_ROOT_ZONE_URL = 'https://www.iana.org/domains/root/db'
|
|
|
|
def fetch_tld_data():
    """Fetch the authoritative TLD list from IANA.

    Returns:
        list[str] | None: Lowercased TLD names (the leading '# Version ...'
        comment line is skipped), or None when the HTTP request fails.
    """
    try:
        # Timeout prevents the script from hanging forever on a stalled server.
        response = requests.get(IANA_TLD_URL, timeout=30)
        response.raise_for_status()

        tlds = []
        for line in response.text.strip().split('\n'):
            line = line.strip()
            if line and not line.startswith('#'):
                tlds.append(line.lower())

        return tlds
    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None
|
|
|
|
def fetch_psl_data():
    """Fetch the Public Suffix List from publicsuffix.org.

    Returns:
        list[str] | None: Lowercased suffix rules ('//' comment lines skipped;
        wildcard '*' and exception '!' rules are kept verbatim), or None when
        the HTTP request fails.
    """
    try:
        # Timeout prevents the script from hanging forever on a stalled server.
        response = requests.get(PSL_URL, timeout=30)
        response.raise_for_status()

        suffixes = []
        for line in response.text.strip().split('\n'):
            line = line.strip()
            if line and not line.startswith('//'):
                suffixes.append(line.lower())

        return suffixes
    except requests.RequestException as e:
        print(f"Error fetching PSL data: {e}")
        return None
|
|
|
|
def fetch_domain_zone_data():
    """Build the domain-zone list using the IANA TLD list as authoritative source.

    Returns:
        list[dict] | None: One {'domain': <tld>} dict per TLD (no dot prefix,
        matching the domain_root convention), or None on fetch/processing error.
    """
    try:
        # Timeout prevents the script from hanging forever on a stalled server.
        tld_response = requests.get(IANA_TLD_URL, timeout=30)
        tld_response.raise_for_status()

        all_tlds = [
            line.strip().lower()
            for line in tld_response.text.strip().split('\n')
            if line.strip() and not line.strip().startswith('#')
        ]

        print(f"Authoritative TLD list contains: {len(all_tlds)} TLDs")

        # Store without dot prefix like domain_root
        domains = [{'domain': tld} for tld in all_tlds]

        print(f"Created domain list: {len(domains)} domains")
        print(f"First 5 domains: {[d['domain'] for d in domains[:5]]}")

        return domains

    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None
    except Exception as e:
        # Defensive catch-all so the caller gets a clean failure signal.
        print(f"Error processing domain data: {e}")
        return None
|
|
|
|
def fetch_domain_detail(detail_url):
    """Scrape type and manager information for one domain from an IANA detail page.

    Args:
        detail_url: URL of the IANA root-db detail page; the last path segment
            is taken as the domain name.

    Returns:
        dict | None: {'domain', 'type', 'tld_manager'} with defaults of
        'generic' / 'Unknown' when the page layout doesn't match, or None on
        any error.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Timeout prevents the script from hanging forever on a stalled server.
        response = requests.get(detail_url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract domain name from URL
        domain = detail_url.split('/')[-1].lower()

        domain_type = "generic"  # default when no type info found
        tld_manager = "Unknown"

        # 'string=' replaces the deprecated 'text=' keyword (bs4 >= 4.4);
        # behavior is identical.
        type_elements = soup.find_all(string=re.compile(r'Type|Type of domain', re.IGNORECASE))
        for element in type_elements:
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    domain_type = next_sibling.get_text(strip=True)
                    break

        manager_elements = soup.find_all(string=re.compile(r'Manager|Sponsor|Registry', re.IGNORECASE))
        for element in manager_elements:
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    tld_manager = next_sibling.get_text(strip=True)
                    break

        return {
            'domain': domain,
            'type': domain_type,
            'tld_manager': tld_manager
        }

    except Exception as e:
        # Broad on purpose: scraping failures should degrade to None, not abort
        # the whole run.
        print(f"Error fetching detail for {detail_url}: {e}")
        return None
|
|
|
|
def enhance_domains_with_idn(domains):
    """Add a 'unicode_domain' key to every domain dict.

    Punycode names ('xn--' prefix) are decoded via idna; plain ASCII names and
    names that fail to decode keep the original string. Input dicts are copied,
    not mutated.

    Args:
        domains: Iterable of dicts each containing at least a 'domain' key.

    Returns:
        list[dict]: New dicts with the extra 'unicode_domain' key.
    """
    enhanced_domains = []

    for domain_data in domains:
        domain = domain_data['domain']
        enhanced_data = domain_data.copy()

        if domain.startswith('xn--'):
            try:
                enhanced_data['unicode_domain'] = idna.decode(domain)
            except (idna.IDNAError, UnicodeError):
                # Narrowed from a bare except: only decode failures fall back;
                # KeyboardInterrupt/SystemExit propagate.
                enhanced_data['unicode_domain'] = domain
        else:
            # ASCII domain: Unicode form equals the original.
            enhanced_data['unicode_domain'] = domain

        enhanced_domains.append(enhanced_data)

    return enhanced_domains
|
|
|
|
def update_domain_root(tlds):
    """Synchronize the domain_root table with the TLD source list (soft delete).

    Active rows missing from the source are flagged removed = TRUE, new TLDs
    are inserted, and previously removed TLDs that reappear are restored.

    Args:
        tlds: Iterable of lowercased TLD strings (authoritative source).

    Returns:
        bool: True on success, False on any database error.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Active entries currently in the database
        cursor.execute("SELECT id, root FROM domain_root WHERE removed = FALSE")
        current_entries = {row[1]: row[0] for row in cursor.fetchall()}

        tld_set = set(tlds)
        current_tlds = set(current_entries)

        # Soft-delete active entries that disappeared from the source
        removed_tlds = current_tlds - tld_set
        if removed_tlds:
            print(f"Marking {len(removed_tlds)} TLDs as removed")
            cursor.executemany(
                "UPDATE domain_root SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE root = %s",
                [(tld,) for tld in removed_tlds]
            )

        # Insert TLDs with no active row. INSERT IGNORE silently skips any that
        # already exist in a removed state; those are restored below.
        new_tlds = tld_set - current_tlds
        if new_tlds:
            print(f"Adding {len(new_tlds)} new TLDs")
            insert_query = "INSERT IGNORE INTO domain_root (root, removed) VALUES (%s, FALSE)"
            batch_size = 100
            new_tld_list = list(new_tlds)
            for i in range(0, len(new_tld_list), batch_size):
                batch = new_tld_list[i:i + batch_size]
                cursor.executemany(insert_query, [(tld,) for tld in batch])
                conn.commit()
                print(f"domain_root batch {i//batch_size + 1}: {cursor.rowcount} new TLDs")

        # Restore rows soft-deleted in the DB but present in the source.
        # BUGFIX: the original intersected the *active* set with the source, so
        # the removed = TRUE lookup could never match, and when that set was
        # empty the generated 'IN ()' clause was a SQL syntax error. Query the
        # removed rows directly instead.
        cursor.execute("SELECT root FROM domain_root WHERE removed = TRUE")
        removed_in_db = {row[0] for row in cursor.fetchall()}
        to_restore = sorted(removed_in_db & tld_set)

        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed TLDs")
            cursor.executemany(
                "UPDATE domain_root SET removed = FALSE, updated_at = CURRENT_TIMESTAMP WHERE root = %s",
                [(tld,) for tld in to_restore]
            )

        # Refresh updated_at on active entries confirmed by the source
        verified_active = current_tlds & tld_set
        if verified_active:  # guard: 'IN ()' with zero placeholders is invalid SQL
            print(f"Updating timestamps for {len(verified_active)} verified active TLDs")
            placeholders = ','.join(['%s'] * len(verified_active))
            cursor.execute(
                "UPDATE domain_root SET updated_at = CURRENT_TIMESTAMP "
                "WHERE removed = FALSE AND root IN (" + placeholders + ")",
                list(verified_active)
            )

        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_root WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_root WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]

        print(f"domain_root update completed:")
        print(f"  Active entries: {active_count}")
        print(f"  Removed entries: {removed_count}")
        print(f"  New entries added: {len(new_tlds)}")
        print(f"  Entries marked as removed: {len(removed_tlds)}")
        print(f"  Entries restored: {len(to_restore)}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_root: {e}")
        return False
    finally:
        # cursor/conn are pre-initialized to None so cleanup is safe even when
        # connect() itself raised (the original referenced possibly-unbound names).
        if cursor is not None:
            cursor.close()
        if conn is not None and conn.is_connected():
            conn.close()
|
|
|
|
def update_domain_suffix(suffixes):
    """Synchronize the domain_suffix table with the PSL source list (soft delete).

    Active rows missing from the source are flagged removed = TRUE, new
    suffixes are inserted, and previously removed suffixes that reappear are
    restored.

    Args:
        suffixes: Iterable of lowercased public-suffix strings.

    Returns:
        bool: True on success, False on any database error.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Active entries currently in the database
        cursor.execute("SELECT id, suffix FROM domain_suffix WHERE removed = FALSE")
        current_entries = {row[1]: row[0] for row in cursor.fetchall()}

        suffix_set = set(suffixes)
        current_suffixes = set(current_entries)

        # Soft-delete active entries that disappeared from the source
        removed_suffixes = current_suffixes - suffix_set
        if removed_suffixes:
            print(f"Marking {len(removed_suffixes)} suffixes as removed")
            cursor.executemany(
                "UPDATE domain_suffix SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE suffix = %s",
                [(suffix,) for suffix in removed_suffixes]
            )

        # Insert suffixes with no active row. INSERT IGNORE silently skips any
        # that already exist in a removed state; those are restored below.
        new_suffixes = suffix_set - current_suffixes
        if new_suffixes:
            print(f"Adding {len(new_suffixes)} new suffixes")
            insert_query = "INSERT IGNORE INTO domain_suffix (suffix, removed) VALUES (%s, FALSE)"
            batch_size = 100
            new_suffix_list = list(new_suffixes)
            for i in range(0, len(new_suffix_list), batch_size):
                batch = new_suffix_list[i:i + batch_size]
                cursor.executemany(insert_query, [(suffix,) for suffix in batch])
                conn.commit()
                print(f"domain_suffix batch {i//batch_size + 1}: {cursor.rowcount} new suffixes")

        # Restore rows soft-deleted in the DB but present in the source.
        # BUGFIX: the original intersected the *active* set with the source, so
        # the removed = TRUE lookup could never match, and when that set was
        # empty the generated 'IN ()' clause was a SQL syntax error. Query the
        # removed rows directly instead.
        cursor.execute("SELECT suffix FROM domain_suffix WHERE removed = TRUE")
        removed_in_db = {row[0] for row in cursor.fetchall()}
        to_restore = sorted(removed_in_db & suffix_set)

        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed suffixes")
            cursor.executemany(
                "UPDATE domain_suffix SET removed = FALSE, updated_at = CURRENT_TIMESTAMP WHERE suffix = %s",
                [(suffix,) for suffix in to_restore]
            )

        # Refresh updated_at on active entries confirmed by the source
        verified_active = current_suffixes & suffix_set
        if verified_active:  # guard: 'IN ()' with zero placeholders is invalid SQL
            print(f"Updating timestamps for {len(verified_active)} verified active suffixes")
            placeholders = ','.join(['%s'] * len(verified_active))
            cursor.execute(
                "UPDATE domain_suffix SET updated_at = CURRENT_TIMESTAMP "
                "WHERE removed = FALSE AND suffix IN (" + placeholders + ")",
                list(verified_active)
            )

        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_suffix WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_suffix WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]

        print(f"domain_suffix update completed:")
        print(f"  Active entries: {active_count}")
        print(f"  Removed entries: {removed_count}")
        print(f"  New entries added: {len(new_suffixes)}")
        print(f"  Entries marked as removed: {len(removed_suffixes)}")
        print(f"  Entries restored: {len(to_restore)}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_suffix: {e}")
        return False
    finally:
        # cursor/conn are pre-initialized to None so cleanup is safe even when
        # connect() itself raised (the original referenced possibly-unbound names).
        if cursor is not None:
            cursor.close()
        if conn is not None and conn.is_connected():
            conn.close()
|
|
|
|
def update_domain_zone(domains):
    """Synchronize the domain_zone table with the fetched domain list (soft delete).

    Handles insert of new domains (with Unicode form and punycode flag),
    soft delete of vanished domains, restore of reappearing ones, and
    root_utf refresh when the Unicode representation changed.

    Args:
        domains: List of dicts with a 'domain' key and an optional
            'unicode_domain' key (as produced by enhance_domains_with_idn).

    Returns:
        bool: True on success, False on any database error.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Active entries currently in the database
        cursor.execute("SELECT id, domain, root_utf, punycode FROM domain_zone WHERE removed = FALSE")
        current_entries = {
            row[1]: {'id': row[0], 'root_utf': row[2], 'punycode': row[3]}
            for row in cursor.fetchall()
        }

        domain_set = {d['domain'] for d in domains}
        current_domains = set(current_entries)

        # BUGFIX: the original defaulted 'unicode_domain' to the whole dict
        # (d.get('unicode_domain', d)) instead of the domain string, which
        # would store a dict repr in root_utf when enhance step was skipped.
        domain_data = {
            d['domain']: {'unicode_domain': d.get('unicode_domain', d['domain'])}
            for d in domains
        }

        # Soft-delete active entries that disappeared from the source
        removed_domains = current_domains - domain_set
        if removed_domains:
            print(f"Marking {len(removed_domains)} domains as removed")
            cursor.executemany(
                "UPDATE domain_zone SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                [(domain,) for domain in removed_domains]
            )

        # Insert domains with no active row. INSERT IGNORE silently skips any
        # that already exist in a removed state; those are restored below.
        new_domains = domain_set - current_domains
        if new_domains:
            print(f"Adding {len(new_domains)} new domains")
            insert_query = ("INSERT IGNORE INTO domain_zone (domain, root_utf, punycode, removed) "
                            "VALUES (%s, %s, %s, FALSE)")
            batch_size = 100
            new_domain_list = list(new_domains)
            for i in range(0, len(new_domain_list), batch_size):
                batch = new_domain_list[i:i + batch_size]
                data = [
                    (domain, domain_data[domain]['unicode_domain'], domain.startswith('xn--'))
                    for domain in batch
                ]
                cursor.executemany(insert_query, data)
                conn.commit()
                print(f"domain_zone batch {i//batch_size + 1}: {cursor.rowcount} new domains")

        # Refresh root_utf on active entries whose Unicode form changed
        common_domains = current_domains & domain_set
        for domain in common_domains:
            new_unicode = domain_data[domain]['unicode_domain']
            if current_entries[domain]['root_utf'] != new_unicode:
                cursor.execute(
                    "UPDATE domain_zone SET root_utf = %s, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (new_unicode, domain)
                )

        # Restore rows soft-deleted in the DB but present in the source.
        # BUGFIX: the original intersected the *active* set with the source, so
        # the removed = TRUE lookup could never match. Query removed rows directly.
        cursor.execute("SELECT domain FROM domain_zone WHERE removed = TRUE")
        removed_in_db = {row[0] for row in cursor.fetchall()}
        to_restore = sorted(removed_in_db & domain_set)

        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed domains")
            for domain in to_restore:
                new_unicode = domain_data[domain]['unicode_domain']
                cursor.execute(
                    "UPDATE domain_zone SET removed = FALSE, root_utf = %s, punycode = %s, "
                    "updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (new_unicode, domain.startswith('xn--'), domain)
                )

        # Refresh updated_at on active entries confirmed by the source
        if common_domains:  # guard: 'IN ()' with zero placeholders is invalid SQL
            print(f"Updating timestamps for {len(common_domains)} verified active domains")
            placeholders = ','.join(['%s'] * len(common_domains))
            cursor.execute(
                "UPDATE domain_zone SET updated_at = CURRENT_TIMESTAMP "
                "WHERE removed = FALSE AND domain IN (" + placeholders + ")",
                list(common_domains)
            )

        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_zone WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_zone WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]

        print(f"domain_zone update completed:")
        print(f"  Active entries: {active_count}")
        print(f"  Removed entries: {removed_count}")
        print(f"  New entries added: {len(new_domains)}")
        print(f"  Entries marked as removed: {len(removed_domains)}")
        print(f"  Entries restored: {len(to_restore)}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_zone: {e}")
        return False
    finally:
        # cursor/conn are pre-initialized to None so cleanup is safe even when
        # connect() itself raised (the original referenced possibly-unbound names).
        if cursor is not None:
            cursor.close()
        if conn is not None and conn.is_connected():
            conn.close()
|
|
|
|
def show_sample_data():
    """Print sample rows from domain_root, domain_suffix and domain_zone.

    Best-effort diagnostic output; database errors are printed, not raised.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        print("\n=== Sample data from domain_root table ===")
        cursor.execute("SELECT id, root, removed, created_at FROM domain_root ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            status = "REMOVED" if row[2] else "ACTIVE"
            print(f"{row[0]} {row[1]} [{status}] {row[3]}")

        print("\n=== Sample data from domain_suffix table ===")
        cursor.execute("SELECT id, suffix, removed, created_at FROM domain_suffix ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            status = "REMOVED" if row[2] else "ACTIVE"
            print(f"{row[0]} {row[1]} [{status}] {row[3]}")

        print("\n=== Sample data from domain_zone table ===")
        cursor.execute("SELECT id, domain, root_utf, punycode, removed, created_at FROM domain_zone ORDER BY id LIMIT 15")
        for row in cursor.fetchall():
            status = "REMOVED" if row[4] else "ACTIVE"
            domain_display = row[1]
            root_utf_display = row[2] if row[2] else ""
            punycode_flag = "IDN" if row[3] else "REG"

            # Show Unicode representation for IDN domains; decode on the fly
            # when root_utf was not stored.
            if row[1].startswith('xn--') and row[2]:
                domain_display = f"{row[1]} ({row[2]})"
            elif row[1].startswith('xn--'):
                try:
                    unicode_domain = idna.decode(row[1])
                    domain_display = f"{row[1]} ({unicode_domain})"
                except (idna.IDNAError, UnicodeError):
                    # Narrowed from a bare except: only decode failures fall back.
                    domain_display = row[1]

            # Avoid printing the UTF column when it just repeats the domain.
            if root_utf_display and root_utf_display != row[1]:
                print(f"{row[0]} {domain_display} [{punycode_flag}] UTF:{root_utf_display} [{status}] {row[5]}")
            else:
                print(f"{row[0]} {domain_display} [{punycode_flag}] [{status}] {row[5]}")

    except mysql.connector.Error as e:
        print(f"Database error: {e}")
    finally:
        # Original leaked cursor/conn on error; always clean up here.
        if cursor is not None:
            cursor.close()
        if conn is not None and conn.is_connected():
            conn.close()
|
|
|
|
def main():
    """Entry point: fetch source data and synchronize all three domain tables."""
    import getpass

    # Password comes from argv[1] when supplied, otherwise via a hidden prompt.
    # NOTE(review): passing secrets on the command line exposes them in process
    # listings; the interactive prompt is the safer path.
    if len(sys.argv) > 1:
        DB_CONFIG['password'] = sys.argv[1]
    else:
        DB_CONFIG['password'] = getpass.getpass("Enter MariaDB password for user 'root': ")

    print("Starting domain tables update process with soft delete functionality...")

    # --- Fetch phase: any failure aborts the run before touching the DB ---
    print(f"\nFetching TLD data from: {IANA_TLD_URL}")
    tlds = fetch_tld_data()
    if not tlds:
        print("Failed to fetch TLD data")
        sys.exit(1)
    print(f"Fetched {len(tlds)} TLDs")

    print(f"\nFetching PSL data from: {PSL_URL}")
    suffixes = fetch_psl_data()
    if not suffixes:
        print("Failed to fetch PSL data")
        sys.exit(1)
    print(f"Fetched {len(suffixes)} suffixes")

    print(f"\nFetching domain zone data from TLD list and web scraping...")
    domains = fetch_domain_zone_data()
    if not domains:
        print("Failed to fetch domain zone data")
        sys.exit(1)
    print(f"Fetched {len(domains)} domain zones")

    # Attach Unicode representations for punycode (xn--) domains
    domains = enhance_domains_with_idn(domains)

    # --- Update phase: each table update must succeed before the next runs ---
    print(f"\nUpdating domain_root table...")
    if not update_domain_root(tlds):
        print("Failed to update domain_root table")
        sys.exit(1)

    print(f"\nUpdating domain_suffix table...")
    if not update_domain_suffix(suffixes):
        print("Failed to update domain_suffix table")
        sys.exit(1)

    print(f"\nUpdating domain_zone table...")
    if not update_domain_zone(domains):
        print("Failed to update domain_zone table")
        sys.exit(1)

    # Diagnostic dump of the freshly updated tables
    show_sample_data()

    print("\n=== Tables update completed successfully ===")
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|