This commit is contained in:
Kar
2026-03-11 23:08:57 +05:30
parent 26e70981ee
commit 95aab950da
6 changed files with 892 additions and 1 deletion

263
update.py
View File

@@ -7,6 +7,9 @@ import mysql.connector
import requests
import sys
from datetime import datetime
from bs4 import BeautifulSoup
import re
import idna
# Database connection configuration
DB_CONFIG = {
@@ -23,6 +26,7 @@ DB_CONFIG = {
# URLs for data sources
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
IANA_ROOT_ZONE_URL = 'https://www.iana.org/domains/root/db'
def fetch_tld_data():
"""Fetch TLD data from IANA"""
@@ -62,6 +66,112 @@ def fetch_psl_data():
print(f"Error fetching PSL data: {e}")
return None
def fetch_domain_zone_data():
    """Fetch domain zone data using the IANA TLD list as authoritative source.

    Downloads the plain-text TLD file from IANA and turns each entry into a
    one-key dict, preserving IANA's publication order.

    Returns:
        list[dict] | None: One ``{'domain': <tld>}`` dict per TLD (lowercase,
        no leading dot), or ``None`` when the fetch or processing fails.
    """
    try:
        # Bound the request so a stalled connection cannot hang the
        # whole update run (requests has no default timeout).
        tld_response = requests.get(IANA_TLD_URL, timeout=30)
        tld_response.raise_for_status()

        # Skip blanks and the leading '# Version ...' comment line.
        all_tlds = [
            line.strip().lower()
            for line in tld_response.text.strip().split('\n')
            if line.strip() and not line.strip().startswith('#')
        ]
        print(f"Authoritative TLD list contains: {len(all_tlds)} TLDs")

        # Store without dot prefix, mirroring the domain_root convention.
        domains = [{'domain': tld} for tld in all_tlds]

        print(f"Created domain list: {len(domains)} domains")
        print(f"First 5 domains: {[d['domain'] for d in domains[:5]]}")
        return domains
    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None
    except Exception as e:
        print(f"Error processing domain data: {e}")
        return None
def fetch_domain_detail(detail_url):
    """Fetch detailed information for a specific domain from its IANA page.

    Scrapes the delegation-record page for the domain's type and its
    manager/registry name by locating the label text and reading the
    adjacent element.

    Args:
        detail_url: URL of the IANA root-db detail page; the last path
            segment is used as the domain name.

    Returns:
        dict | None: ``{'domain', 'type', 'tld_manager'}`` on success,
        ``None`` when the fetch or parse fails.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Bound the request so one slow page cannot stall the crawl.
        response = requests.get(detail_url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract domain name from the URL's last path segment.
        domain = detail_url.split('/')[-1].lower()

        # Defaults used when the page layout does not match the probes below.
        domain_type = "generic"
        tld_manager = "Unknown"

        # Probe for the "Type" label and read the adjacent element's text.
        # bs4 deprecated the `text=` keyword in favour of `string=`.
        type_elements = soup.find_all(string=re.compile(r'Type|Type of domain', re.IGNORECASE))
        for element in type_elements:
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    domain_type = next_sibling.get_text(strip=True)
                    break

        # Same approach for the manager / registry name.
        manager_elements = soup.find_all(string=re.compile(r'Manager|Sponsor|Registry', re.IGNORECASE))
        for element in manager_elements:
            parent = element.parent
            if parent:
                next_sibling = parent.find_next_sibling() or parent.find_next()
                if next_sibling:
                    tld_manager = next_sibling.get_text(strip=True)
                    break

        return {
            'domain': domain,
            'type': domain_type,
            'tld_manager': tld_manager
        }
    except Exception as e:
        print(f"Error fetching detail for {detail_url}: {e}")
        return None
def enhance_domains_with_idn(domains):
    """Enhance domain list with IDN Unicode representations.

    For Punycode entries (``xn--`` prefix) adds a ``unicode_domain`` key
    with the decoded Unicode form; plain-ASCII entries get their own name
    echoed back. Input dicts are copied, not mutated.

    Args:
        domains: list of dicts, each carrying at least a ``'domain'`` key.

    Returns:
        list[dict]: New list of shallow-copied dicts with ``unicode_domain``
        added to each.
    """
    enhanced_domains = []
    for domain_data in domains:
        domain = domain_data['domain']
        enhanced_data = domain_data.copy()
        if domain.startswith('xn--'):
            try:
                enhanced_data['unicode_domain'] = idna.decode(domain)
            except UnicodeError:
                # idna.IDNAError subclasses UnicodeError; fall back to the
                # ASCII form for labels idna refuses to decode. (The old
                # bare `except:` also swallowed KeyboardInterrupt/SystemExit.)
                enhanced_data['unicode_domain'] = domain
        else:
            enhanced_data['unicode_domain'] = domain
        enhanced_domains.append(enhanced_data)
    return enhanced_domains
def update_domain_root(tlds):
"""Update domain_root table with soft delete and new entries"""
try:
@@ -234,8 +344,118 @@ def update_domain_suffix(suffixes):
cursor.close()
conn.close()
def update_domain_zone(domains):
    """Update domain_zone table with soft delete and new entries.

    Synchronizes the table with the fetched source list:
      * rows missing from the source are soft-deleted (removed = TRUE),
      * source entries not currently active are inserted (or restored when
        a soft-deleted row with that domain already exists),
      * root_utf is refreshed when the Unicode form changed,
      * updated_at is touched for every verified active row.

    Args:
        domains: list of dicts with ``'domain'`` and optionally
            ``'unicode_domain'`` keys.

    Returns:
        bool: True on success, False on database error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Current active rows keyed by domain name.
        cursor.execute("SELECT id, domain, root_utf, punycode FROM domain_zone WHERE removed = FALSE")
        current_entries = {row[1]: {'id': row[0], 'root_utf': row[2], 'punycode': row[3]} for row in cursor.fetchall()}

        # Sets for fast membership tests.
        domain_set = {domain['domain'] for domain in domains}
        current_domains = set(current_entries.keys())

        # Map each source domain to its Unicode form.
        # BUGFIX: the fallback used to be the whole dict (`domain`), which
        # stored a dict repr in root_utf; it must be the name string.
        domain_data = {
            d['domain']: {'unicode_domain': d.get('unicode_domain', d['domain'])}
            for d in domains
        }

        # Soft-delete rows that disappeared from the source.
        removed_domains = current_domains - domain_set
        if removed_domains:
            print(f"Marking {len(removed_domains)} domains as removed")
            for domain in removed_domains:
                cursor.execute(
                    "UPDATE domain_zone SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (domain,)
                )

        # Insert source domains that are not currently active, in batches.
        new_domains = domain_set - current_domains
        if new_domains:
            print(f"Adding {len(new_domains)} new domains")
            insert_query = "INSERT IGNORE INTO domain_zone (domain, root_utf, punycode, removed) VALUES (%s, %s, %s, FALSE)"
            batch_size = 100
            new_domain_list = list(new_domains)
            for i in range(0, len(new_domain_list), batch_size):
                batch = new_domain_list[i:i + batch_size]
                data = [
                    (domain, domain_data[domain]['unicode_domain'], domain.startswith('xn--'))
                    for domain in batch
                ]
                cursor.executemany(insert_query, data)
                conn.commit()
                print(f"domain_zone batch {i//batch_size + 1}: {cursor.rowcount} new domains")

        # Refresh root_utf on still-active rows whose Unicode form changed.
        common_domains = current_domains & domain_set
        for domain in common_domains:
            new_unicode = domain_data[domain]['unicode_domain']
            if current_entries[domain]['root_utf'] != new_unicode:
                cursor.execute(
                    "UPDATE domain_zone SET root_utf = %s, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (new_unicode, domain)
                )

        # Restore soft-deleted rows that reappeared in the source.
        # BUGFIX: this previously searched common_domains, which by
        # construction only contains removed = FALSE rows, so nothing was
        # ever restored. Re-appearing domains land in new_domains instead
        # (their INSERT IGNORE is skipped because the row already exists),
        # so that is the set to probe. The IN-list uses %s placeholders,
        # so values are still bound by the driver, not interpolated.
        if new_domains:
            placeholders = ','.join(['%s'] * len(new_domains))
            cursor.execute(
                "SELECT domain FROM domain_zone WHERE removed = TRUE AND domain IN (%s)" % placeholders,
                list(new_domains)
            )
            to_restore = [row[0] for row in cursor.fetchall()]
        else:
            to_restore = []

        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed domains")
            for domain in to_restore:
                cursor.execute(
                    "UPDATE domain_zone SET removed = FALSE, root_utf = %s, punycode = %s, updated_at = CURRENT_TIMESTAMP WHERE domain = %s",
                    (domain_data[domain]['unicode_domain'], domain.startswith('xn--'), domain)
                )

        # Touch updated_at for every active row verified against the source.
        if common_domains:
            print(f"Updating timestamps for {len(common_domains)} verified active domains")
            cursor.execute(
                "UPDATE domain_zone SET updated_at = CURRENT_TIMESTAMP WHERE removed = FALSE AND domain IN (%s)" %
                ','.join(['%s'] * len(common_domains)), list(common_domains)
            )

        conn.commit()

        # Report final statistics.
        cursor.execute("SELECT COUNT(*) FROM domain_zone WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_zone WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]
        print(f"domain_zone update completed:")
        print(f"  Active entries: {active_count}")
        print(f"  Removed entries: {removed_count}")
        print(f"  New entries added: {len(new_domains)}")
        print(f"  Entries marked as removed: {len(removed_domains)}")
        print(f"  Entries restored: {len(to_restore)}")
        return True
    except mysql.connector.Error as e:
        print(f"Database error in domain_zone: {e}")
        return False
    finally:
        # Close cursor/connection only if the connect succeeded.
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()
def show_sample_data():
"""Show sample data from both tables"""
"""Show sample data from all tables"""
try:
conn = mysql.connector.connect(**DB_CONFIG)
cursor = conn.cursor()
@@ -252,6 +472,30 @@ def show_sample_data():
status = "REMOVED" if row[2] else "ACTIVE"
print(f"{row[0]} {row[1]} [{status}] {row[3]}")
print("\n=== Sample data from domain_zone table ===")
cursor.execute("SELECT id, domain, root_utf, punycode, removed, created_at FROM domain_zone ORDER BY id LIMIT 15")
for row in cursor.fetchall():
status = "REMOVED" if row[4] else "ACTIVE"
domain_display = row[1]
root_utf_display = row[2] if row[2] else ""
punycode_flag = "IDN" if row[3] else "REG"
# Show Unicode representation for IDN domains
if row[1].startswith('xn--') and row[2]:
domain_display = f"{row[1]} ({row[2]})"
elif row[1].startswith('xn--'):
try:
unicode_domain = idna.decode(row[1])
domain_display = f"{row[1]} ({unicode_domain})"
except:
domain_display = row[1]
# Format the display
if root_utf_display and root_utf_display != row[1]:
print(f"{row[0]} {domain_display} [{punycode_flag}] UTF:{root_utf_display} [{status}] {row[5]}")
else:
print(f"{row[0]} {domain_display} [{punycode_flag}] [{status}] {row[5]}")
cursor.close()
conn.close()
@@ -287,6 +531,17 @@ def main():
sys.exit(1)
print(f"Fetched {len(suffixes)} suffixes")
# Fetch domain zone data
print(f"\nFetching domain zone data from TLD list and web scraping...")
domains = fetch_domain_zone_data()
if not domains:
print("Failed to fetch domain zone data")
sys.exit(1)
print(f"Fetched {len(domains)} domain zones")
# Enhance domains with IDN Unicode representations
domains = enhance_domains_with_idn(domains)
# Update domain_root table
print(f"\nUpdating domain_root table...")
if not update_domain_root(tlds):
@@ -299,6 +554,12 @@ def main():
print("Failed to update domain_suffix table")
sys.exit(1)
# Update domain_zone table
print(f"\nUpdating domain_zone table...")
if not update_domain_zone(domains):
print("Failed to update domain_zone table")
sys.exit(1)
# Show sample data
show_sample_data()