update
This commit is contained in:
346
update_data_domain_zone.py
Normal file
346
update_data_domain_zone.py
Normal file
@@ -0,0 +1,346 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script to update domain_zone table with additional data fields
|
||||
"""
|
||||
|
||||
import mysql.connector
|
||||
import requests
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import idna
|
||||
|
||||
# Connection settings, unpacked as keyword arguments into
# mysql.connector.connect(**DB_CONFIG) by the database helpers below.
# 'password' is a placeholder here; main() fills it in before any
# connection is opened.
DB_CONFIG = dict(
    host='l2',
    port=3306,
    user='root',
    password=None,  # Will be set from command line or input
    database='sp_spider',
    charset='utf8mb4',
    ssl_disabled=True,
    auth_plugin='mysql_native_password',
)
|
||||
|
||||
# Result keys paired with the (case-insensitive) label regex used to locate
# their value on the IANA page. Each value is read from the <td> that follows
# the label's own <td>.
_IANA_LABEL_PATTERNS = (
    ('whois_server', r'WHOIS|whois'),
    ('rdap_server', r'RDAP|rdap'),
    ('sponsoring', r'sponsor|registry|manager'),
    ('administrative', r'administrative|admin'),
    ('technical', r'technical|tech'),
)


def _iana_cell_after_label(soup, pattern):
    """Return the text of the ``<td>`` that follows a matching label cell.

    Only the FIRST text node in the document matching *pattern* is examined.
    ``None`` is returned when nothing matches, when that text node's parent
    is not a ``<td>``, or when the parent has no following ``<td>`` sibling.
    """
    # `string=` is the current name of the argument formerly called `text=`.
    label = soup.find(string=re.compile(pattern, re.IGNORECASE))
    if not label:
        return None

    label_cell = label.parent
    if not label_cell or label_cell.name != 'td':
        return None

    value_cell = label_cell.find_next_sibling('td')
    if not value_cell:
        return None

    return value_cell.get_text(strip=True)


def get_iana_registry_info(domain):
    """Scrape the IANA root-zone database page for *domain*.

    Args:
        domain: Zone name; leading dots are stripped before it is placed in
            the ``https://www.iana.org/domains/root/db/<domain>`` URL.

    Returns:
        A dict with the keys ``registration_url``, ``whois_server``,
        ``rdap_server``, ``name_servers`` (list), ``sponsoring``,
        ``administrative`` and ``technical``. Fields that could not be
        located stay ``None`` (``[]`` for ``name_servers``).
        Returns ``None`` if the request or the parsing fails for any reason;
        the error is printed, never raised (best-effort enrichment).
    """
    try:
        # Remove dot prefix if present
        clean_domain = domain.lstrip('.')

        url = f"https://www.iana.org/domains/root/db/{clean_domain}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        info = {
            'registration_url': None,
            'whois_server': None,
            'rdap_server': None,
            'name_servers': [],
            'sponsoring': None,
            'administrative': None,
            'technical': None,
        }

        # Registration URL: href of the first link whose target mentions
        # "registrar" or "registry".
        reg_link = soup.find('a', href=re.compile(r'registrar|registry', re.IGNORECASE))
        if reg_link:
            info['registration_url'] = reg_link.get('href')

        # Scalar fields all share the same "label cell -> next cell" lookup.
        for key, pattern in _IANA_LABEL_PATTERNS:
            info[key] = _iana_cell_after_label(soup, pattern)

        # Name servers are expected as one comma-separated cell.
        # NOTE(review): the `ns` alternative matches any text containing
        # "ns", so the first hit may not be a name-server label -- confirm
        # against the real page markup.
        ns_text = _iana_cell_after_label(soup, r'name.?server|ns')
        if ns_text:
            info['name_servers'] = [ns.strip() for ns in ns_text.split(',') if ns.strip()]

        return info

    except Exception as e:
        # Deliberately broad: a failed lookup must not abort the batch.
        print(f"Error fetching IANA info for {domain}: {e}")
        return None
|
||||
|
||||
def get_rdap_info(domain):
    """Query a fixed list of RDAP endpoints for *domain* and return the first hit.

    The endpoints are tried in order; the first one answering HTTP 200 with a
    parseable JSON body wins. Any failure on one endpoint (network error,
    non-JSON body, unexpected structure) just moves on to the next one.

    Args:
        domain: Zone name; leading dots are stripped before it is placed in
            the endpoint URLs.

    Returns:
        ``None`` if no endpoint produced a usable answer, otherwise a dict:
            ``rdap_server``  -- host part of the endpoint URL that answered
            ``name_servers`` -- non-empty ``ldhName`` values from the
                                response's ``nameservers`` array
            ``port43``       -- WHOIS server advertised by the response,
                                or ``None`` when absent
    """
    try:
        # Remove dot prefix if present
        clean_domain = domain.lstrip('.')

        # Try common RDAP servers
        rdap_servers = [
            f"https://rdap.org/domain/{clean_domain}",
            f"https://data.iana.org/rdap/{clean_domain}",
            f"https://rdap.verisign.com/com/v1/domain/{clean_domain}",
            f"https://rdap.nic.fr/domain/{clean_domain}",
        ]

        for rdap_url in rdap_servers:
            try:
                response = requests.get(rdap_url, timeout=5)
                if response.status_code != 200:
                    continue

                data = response.json()

                info = {
                    # "https://host/path".split('/') -> ['https:', '', 'host', ...]
                    'rdap_server': rdap_url.split('/')[2],
                    'name_servers': [],
                    'port43': None,
                }

                # Extract name servers from RDAP data
                if 'nameservers' in data:
                    info['name_servers'] = [
                        ns['ldhName'] for ns in data['nameservers'] if ns.get('ldhName')
                    ]

                # RDAP defines `port43` as a plain string (WHOIS host name or
                # IP address). Treating it as an object raised AttributeError
                # and threw away an otherwise valid response. The object form
                # is still tolerated in case some server emits it.
                port43 = data.get('port43')
                if isinstance(port43, str):
                    info['port43'] = port43
                elif isinstance(port43, dict):
                    info['port43'] = port43.get('server', '')

                return info

            except Exception:
                # Best effort: any problem with this endpoint -> try the next.
                continue

        return None

    except Exception as e:
        print(f"Error fetching RDAP info for {domain}: {e}")
        return None
|
||||
|
||||
def get_dns_servers(domain):
    """Resolve the NS records of *domain* via ``dns.resolver``.

    Leading dots are stripped from *domain* before the query.

    Returns:
        ``{'name_servers': [...]}`` with each NS record converted to ``str``,
        or ``None`` if anything at all goes wrong -- including the
        ``dns.resolver`` import itself failing, since the import sits inside
        the ``try``.
    """
    try:
        from dns import resolver

        zone = domain.lstrip('.')
        ns_records = resolver.resolve(zone, 'NS')
        return {'name_servers': list(map(str, ns_records))}
    except Exception:
        return None
|
||||
|
||||
def update_domain_data():
    """Enrich up to 50 ``domain_zone`` rows with IANA, RDAP and DNS data.

    Rows are selected (ordered by id) when they are not flagged as removed
    and at least one of ``registration_url``, ``whois_server`` or
    ``rdap_server`` is NULL. For each row the three lookup helpers are
    called, the row is updated and committed, and the loop sleeps one second
    before the next row. Summary statistics are printed at the end.

    Only columns for which a lookup actually produced a value are written, so
    a failed lookup never replaces existing data with NULL or an empty list.
    ``updated_at`` is refreshed for every processed row regardless.

    NOTE(review): rows whose lookups keep returning nothing stay NULL and are
    selected again on the next run, which can starve later rows given the
    ``ORDER BY id LIMIT 50`` -- confirm whether that is acceptable.

    Returns:
        ``True`` on success (including "nothing to do"), ``False`` if a
        ``mysql.connector.Error`` occurred (the error is printed).
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Get domains that need data enrichment
        cursor.execute("""
            SELECT id, domain, root_utf
            FROM domain_zone
            WHERE removed = FALSE
            AND (registration_url IS NULL OR whois_server IS NULL OR rdap_server IS NULL)
            ORDER BY id
            LIMIT 50
        """)

        domains_to_update = cursor.fetchall()

        if not domains_to_update:
            print("All domains already have complete data!")
            return True

        print(f"Updating data for {len(domains_to_update)} domains...")

        for domain_id, domain, _root_utf in domains_to_update:
            print(f"Processing {domain}...")

            # Each helper returns None on failure; normalise to {} so the
            # merge below can use plain .get() calls.
            iana_info = get_iana_registry_info(domain) or {}
            rdap_info = get_rdap_info(domain) or {}
            dns_info = get_dns_servers(domain) or {}

            # Prefer RDAP name servers; fall back to DNS when RDAP failed
            # OR answered without listing any.
            name_servers = (
                rdap_info.get('name_servers')
                or dns_info.get('name_servers')
                or []
            )

            fetched = {
                'registration_url': iana_info.get('registration_url'),
                'whois_server': iana_info.get('whois_server'),
                'rdap_server': rdap_info.get('rdap_server'),
                'name_servers': json.dumps(name_servers) if name_servers else None,
                'sponsoring': iana_info.get('sponsoring'),
                'administrative': iana_info.get('administrative'),
                'technical': iana_info.get('technical'),
            }

            # Keep only the columns we really have a value for, so existing
            # data is never overwritten with NULL.
            changes = {col: val for col, val in fetched.items() if val is not None}

            # Column names come from the literal keys above (never from
            # external input); all values are passed as bound parameters.
            assignments = [f"{col} = %s" for col in changes]
            assignments.append("updated_at = CURRENT_TIMESTAMP")
            sql = f"UPDATE domain_zone SET {', '.join(assignments)} WHERE id = %s"

            cursor.execute(sql, (*changes.values(), domain_id))
            conn.commit()

            # Rate limiting
            time.sleep(1)

        print(f"Updated {len(domains_to_update)} domains successfully")

        # Show statistics
        cursor.execute("""
            SELECT
                COUNT(*) as total,
                COUNT(CASE WHEN registration_url IS NOT NULL THEN 1 END) as with_reg_url,
                COUNT(CASE WHEN whois_server IS NOT NULL THEN 1 END) as with_whois,
                COUNT(CASE WHEN rdap_server IS NOT NULL THEN 1 END) as with_rdap,
                COUNT(CASE WHEN name_servers IS NOT NULL THEN 1 END) as with_ns
            FROM domain_zone
            WHERE removed = FALSE
        """)

        stats = cursor.fetchone()
        print(f"\nDatabase Statistics:")
        print(f" Total domains: {stats[0]}")
        print(f" With registration URL: {stats[1]}")
        print(f" With WHOIS server: {stats[2]}")
        print(f" With RDAP server: {stats[3]}")
        print(f" With name servers: {stats[4]}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error: {e}")
        return False

    finally:
        # conn/cursor are pre-initialised to None so this cleanup is safe
        # even when connect() or cursor() itself failed.
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()
|
||||
|
||||
def show_sample_data():
    """Print up to 10 enriched rows from the ``domain_zone`` table.

    Rows are those not flagged as removed that have a registration URL or a
    WHOIS server, ordered by id. For domains starting with ``xn--`` the
    ``root_utf`` value is shown in parentheses when present.

    A ``mysql.connector.Error`` is printed rather than raised. The cursor and
    connection are released in a ``finally`` block so they are not leaked
    when the query or the printing fails.
    """
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        print("\n=== Sample enriched data from domain_zone table ===")
        cursor.execute("""
            SELECT id, domain, root_utf, whois_server, rdap_server,
            JSON_LENGTH(name_servers) as ns_count, sponsoring
            FROM domain_zone
            WHERE removed = FALSE
            AND (registration_url IS NOT NULL OR whois_server IS NOT NULL)
            ORDER BY id
            LIMIT 10
        """)

        for row_id, domain, root_utf, whois, rdap, ns_count, sponsor in cursor.fetchall():
            domain_display = domain
            if domain.startswith('xn--') and root_utf:
                domain_display = f"{domain} ({root_utf})"

            print(f"{row_id} {domain_display}")
            print(f" WHOIS: {whois or 'N/A'}")
            print(f" RDAP: {rdap or 'N/A'}")
            # JSON_LENGTH(NULL) is NULL, hence the `or 0`.
            print(f" Name Servers: {ns_count or 0}")
            print(f" Sponsor: {sponsor or 'N/A'}")
            print()

    except mysql.connector.Error as e:
        print(f"Database error: {e}")

    finally:
        if conn is not None and conn.is_connected():
            if cursor is not None:
                cursor.close()
            conn.close()
|
||||
|
||||
def main():
    """Script entry point: obtain the DB password, enrich, then show a sample.

    The password is taken from the first command-line argument when one is
    given; otherwise it is requested interactively without echo. It is stored
    in ``DB_CONFIG['password']`` before any database work starts. The process
    exits with status 1 if ``update_domain_data()`` reports failure.

    NOTE(review): a password passed as a command-line argument may be visible
    to other users in the process list -- consider preferring the prompt.
    """
    import getpass

    cli_args = sys.argv[1:]
    if cli_args:
        DB_CONFIG['password'] = cli_args[0]
    else:
        DB_CONFIG['password'] = getpass.getpass("Enter MariaDB password for user 'root': ")

    print("Starting domain_zone data enrichment process...")

    # Enrichment step; abort on database failure.
    succeeded = update_domain_data()
    if not succeeded:
        print("Failed to update domain data")
        sys.exit(1)

    # Print a few enriched rows as a sanity check.
    show_sample_data()

    print("\n=== Domain data enrichment completed ===")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user