init
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
venv/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
*.log
|
||||||
147
README.md
Normal file
147
README.md
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
# Internet Domain Database
|
||||||
|
|
||||||
|
This project maintains two separate tables for internet domain data:
|
||||||
|
- `domain_root`: Top-Level Domains (TLDs) from IANA
|
||||||
|
- `domain_suffix`: Public Suffix List from Mozilla
|
||||||
|
|
||||||
|
## Database Connection
|
||||||
|
|
||||||
|
- **Host**: l2
|
||||||
|
- **Port**: 3306
|
||||||
|
- **User**: root
|
||||||
|
- **Database**: sp_spider
|
||||||
|
|
||||||
|
## Table Structure
|
||||||
|
|
||||||
|
### domain_root
|
||||||
|
Contains IANA TLD data with unique root domains and soft delete functionality.
|
||||||
|
```sql
|
||||||
|
CREATE TABLE domain_root (
|
||||||
|
id INT AUTO_INCREMENT PRIMARY KEY,
|
||||||
|
root VARCHAR(63) NOT NULL UNIQUE,
|
||||||
|
removed BOOLEAN DEFAULT FALSE,
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||||
|
INDEX idx_root (root),
|
||||||
|
INDEX idx_removed (removed)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### domain_suffix
|
||||||
|
Contains Public Suffix List data with unique suffixes and soft delete functionality.
|
||||||
|
```sql
|
||||||
|
CREATE TABLE domain_suffix (
|
||||||
|
id INT AUTO_INCREMENT PRIMARY KEY,
|
||||||
|
suffix VARCHAR(255) NOT NULL UNIQUE,
|
||||||
|
removed BOOLEAN DEFAULT FALSE,
|
||||||
|
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||||
|
INDEX idx_suffix (suffix),
|
||||||
|
INDEX idx_removed (removed)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
1. Install dependencies:
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Create the tables:
|
||||||
|
```bash
|
||||||
|
mariadb -h l2 -u root -p0000 --ssl=FALSE sp_spider < create_separate_tables.sql
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Add the removed column (for existing installations):
|
||||||
|
```bash
|
||||||
|
mariadb -h l2 -u root -p0000 --ssl=FALSE sp_spider < add_removed_column.sql
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Populate the tables:
|
||||||
|
```bash
|
||||||
|
python populate_separate_tables.py 0000
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Update tables with soft delete functionality:
|
||||||
|
```bash
|
||||||
|
python update.py 0000
|
||||||
|
```
|
||||||
|
|
||||||
|
## Data Sources
|
||||||
|
|
||||||
|
- **TLD Data**: https://data.iana.org/TLD/tlds-alpha-by-domain.txt
|
||||||
|
- Contains official Top-Level Domains
|
||||||
|
- Currently: 1,436 TLDs
|
||||||
|
|
||||||
|
- **Public Suffix List**: https://publicsuffix.org/list/public_suffix_list.dat
|
||||||
|
- Contains public domain suffixes including TLDs, ccTLDs, and private domain suffixes
|
||||||
|
- Currently: 10,067 suffixes
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Query active TLDs:
|
||||||
|
```sql
|
||||||
|
SELECT * FROM domain_root WHERE removed = FALSE AND root = 'com';
|
||||||
|
```
|
||||||
|
|
||||||
|
Query removed TLDs:
|
||||||
|
```sql
|
||||||
|
SELECT * FROM domain_root WHERE removed = TRUE;
|
||||||
|
```
|
||||||
|
|
||||||
|
Query active suffixes:
|
||||||
|
```sql
|
||||||
|
SELECT * FROM domain_suffix WHERE removed = FALSE AND suffix LIKE '%.com';
|
||||||
|
```
|
||||||
|
|
||||||
|
Query removed suffixes:
|
||||||
|
```sql
|
||||||
|
SELECT * FROM domain_suffix WHERE removed = TRUE;
|
||||||
|
```
|
||||||
|
|
||||||
|
Get statistics:
|
||||||
|
```sql
|
||||||
|
SELECT
|
||||||
|
'domain_root' as table_name,
|
||||||
|
COUNT(*) as total,
|
||||||
|
SUM(CASE WHEN removed = FALSE THEN 1 ELSE 0 END) as active,
|
||||||
|
SUM(CASE WHEN removed = TRUE THEN 1 ELSE 0 END) as removed
|
||||||
|
FROM domain_root
|
||||||
|
UNION ALL
|
||||||
|
SELECT
|
||||||
|
'domain_suffix' as table_name,
|
||||||
|
COUNT(*) as total,
|
||||||
|
SUM(CASE WHEN removed = FALSE THEN 1 ELSE 0 END) as active,
|
||||||
|
SUM(CASE WHEN removed = TRUE THEN 1 ELSE 0 END) as removed
|
||||||
|
FROM domain_suffix;
|
||||||
|
```
|
||||||
|
|
||||||
|
## Project Files
|
||||||
|
|
||||||
|
- `create_separate_tables.sql` - Table creation script
|
||||||
|
- `add_removed_column.sql` - Script to add removed column for soft delete functionality
|
||||||
|
- `populate_separate_tables.py` - Initial data population script
|
||||||
|
- `update.py` - Update script with soft delete functionality
|
||||||
|
- `requirements.txt` - Python dependencies
|
||||||
|
- `README.md` - This documentation
|
||||||
|
|
||||||
|
## Soft Delete Functionality
|
||||||
|
|
||||||
|
The `update.py` script provides soft delete functionality that:
|
||||||
|
|
||||||
|
1. **Marks entries as removed**: Sets `removed = TRUE` for entries no longer found in source data
|
||||||
|
2. **Adds new entries**: Inserts new entries from source with `removed = FALSE`
|
||||||
|
3. **Restores entries**: Sets `removed = FALSE` for previously removed entries that reappear in source
|
||||||
|
4. **Provides statistics**: Shows counts of active, removed, new, and restored entries
|
||||||
|
|
||||||
|
### Update Process
|
||||||
|
|
||||||
|
The update script:
|
||||||
|
- Fetches latest data from IANA TLD and Public Suffix List
|
||||||
|
- Compares with current database entries
|
||||||
|
- Performs batch updates for efficiency
|
||||||
|
- Handles duplicate entries gracefully with `INSERT IGNORE`
|
||||||
|
- Updates `updated_at` timestamp for all changes
|
||||||
|
|
||||||
|
Run the update script periodically to keep the database synchronized with source data.
|
||||||
11
add_removed_column.sql
Normal file
11
add_removed_column.sql
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
-- Migration: add soft-delete support to existing installations.
-- IF NOT EXISTS makes the script idempotent (safe to re-run, and safe on
-- databases created with a schema that already includes the column/index).
-- Supported by MariaDB, which this project targets (see README).

-- Add removed column to domain_root table
ALTER TABLE domain_root
ADD COLUMN IF NOT EXISTS removed BOOLEAN DEFAULT FALSE;

-- Add removed column to domain_suffix table
ALTER TABLE domain_suffix
ADD COLUMN IF NOT EXISTS removed BOOLEAN DEFAULT FALSE;

-- Add index for removed column for better query performance
ALTER TABLE domain_root ADD INDEX IF NOT EXISTS idx_removed (removed);
ALTER TABLE domain_suffix ADD INDEX IF NOT EXISTS idx_removed (removed);
|
||||||
17
create_separate_tables.sql
Normal file
17
create_separate_tables.sql
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
-- Create domain_root table for IANA TLD data.
-- Includes the `removed` soft-delete flag and idx_removed index documented
-- in the README's "Table Structure" section, so a fresh install matches the
-- documented schema without additionally running add_removed_column.sql
-- (which remains safe to run thanks to its own guards).
CREATE TABLE IF NOT EXISTS domain_root (
    id INT AUTO_INCREMENT PRIMARY KEY,
    root VARCHAR(63) NOT NULL UNIQUE,
    removed BOOLEAN DEFAULT FALSE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_root (root),
    INDEX idx_removed (removed)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- Create domain_suffix table for Public Suffix List data
CREATE TABLE IF NOT EXISTS domain_suffix (
    id INT AUTO_INCREMENT PRIMARY KEY,
    suffix VARCHAR(255) NOT NULL UNIQUE,
    removed BOOLEAN DEFAULT FALSE,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    INDEX idx_suffix (suffix),
    INDEX idx_removed (removed)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||||
206
populate_separate_tables.py
Normal file
206
populate_separate_tables.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Script to populate domain_root and domain_suffix tables separately
|
||||||
|
"""
|
||||||
|
|
||||||
|
import mysql.connector
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Database connection configuration
# (matches the connection details in README: host l2, port 3306, db sp_spider)
DB_CONFIG = {
    'host': 'l2',
    'port': 3306,
    'user': 'root',
    'password': None,  # Will be set from command line or input
    'database': 'sp_spider',
    'charset': 'utf8mb4',
    'ssl_disabled': True,  # server is reached without TLS (README uses --ssl=FALSE)
    'auth_plugin': 'mysql_native_password'
}

# URLs for data sources
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'  # official IANA TLD registry
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'  # Mozilla Public Suffix List
|
||||||
|
|
||||||
|
def fetch_tld_data():
    """Fetch TLD data from IANA.

    Downloads tlds-alpha-by-domain.txt, skips the '#' comment/header lines,
    and lowercases each TLD.

    Returns:
        list[str]: lowercase TLDs, or None if the download failed.
    """
    try:
        # timeout prevents the script from hanging indefinitely on a stalled
        # connection: requests.get() has NO timeout by default.
        response = requests.get(IANA_TLD_URL, timeout=30)
        response.raise_for_status()

        lines = response.text.strip().split('\n')
        tlds = []

        for line in lines:
            line = line.strip()
            if line and not line.startswith('#'):
                tlds.append(line.lower())

        return tlds
    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None
|
||||||
|
|
||||||
|
def fetch_psl_data():
    """Fetch Public Suffix List data.

    Downloads public_suffix_list.dat, skips '//' comment lines and blanks,
    and lowercases each suffix.

    Returns:
        list[str]: lowercase suffixes, or None if the download failed.
    """
    try:
        # timeout prevents the script from hanging indefinitely on a stalled
        # connection: requests.get() has NO timeout by default.
        response = requests.get(PSL_URL, timeout=30)
        response.raise_for_status()

        lines = response.text.strip().split('\n')
        suffixes = []

        for line in lines:
            line = line.strip()
            if line and not line.startswith('//'):
                suffixes.append(line.lower())

        return suffixes
    except requests.RequestException as e:
        print(f"Error fetching PSL data: {e}")
        return None
|
||||||
|
|
||||||
|
def populate_domain_root(tlds):
    """Populate domain_root table with TLD data.

    Inserts each TLD in batches of 100, relying on INSERT IGNORE (the `root`
    column is UNIQUE) to silently skip entries that already exist.

    Args:
        tlds: sequence of lowercase TLD strings (as produced by fetch_tld_data).

    Returns:
        True on success, False on any mysql.connector error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # INSERT IGNORE: duplicate roots are skipped instead of raising.
        insert_query = "INSERT IGNORE INTO domain_root (root) VALUES (%s)"

        batch_size = 100
        inserted_count = 0

        # Commit after every batch so progress is preserved if a later batch fails.
        for i in range(0, len(tlds), batch_size):
            batch = tlds[i:i + batch_size]
            data = [(tld,) for tld in batch]
            cursor.executemany(insert_query, data)
            # NOTE(review): rowcount after executemany reflects the affected
            # rows of the batch; rows skipped by IGNORE are not counted.
            inserted_count += cursor.rowcount
            conn.commit()
            print(f"domain_root batch {i//batch_size + 1}: {cursor.rowcount} TLDs")

        print(f"Successfully inserted {inserted_count} TLDs into domain_root table")

        # Get total count
        cursor.execute("SELECT COUNT(*) FROM domain_root")
        total_count = cursor.fetchone()[0]
        print(f"Total TLDs in domain_root table: {total_count}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_root: {e}")
        return False
    finally:
        # Close resources only if the connection was actually established
        # (conn is absent from locals() when connect() itself raised).
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()
|
||||||
|
|
||||||
|
def populate_domain_suffix(suffixes):
    """Populate domain_suffix table with PSL data.

    Inserts each suffix in batches of 100, relying on INSERT IGNORE (the
    `suffix` column is UNIQUE) to silently skip entries that already exist.

    Args:
        suffixes: sequence of lowercase suffix strings (from fetch_psl_data).

    Returns:
        True on success, False on any mysql.connector error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # INSERT IGNORE: duplicate suffixes are skipped instead of raising.
        insert_query = "INSERT IGNORE INTO domain_suffix (suffix) VALUES (%s)"

        batch_size = 100
        inserted_count = 0

        # Commit after every batch so progress is preserved if a later batch fails.
        for i in range(0, len(suffixes), batch_size):
            batch = suffixes[i:i + batch_size]
            data = [(suffix,) for suffix in batch]
            cursor.executemany(insert_query, data)
            # NOTE(review): rowcount counts the batch's affected rows only;
            # rows skipped by IGNORE are not counted.
            inserted_count += cursor.rowcount
            conn.commit()
            print(f"domain_suffix batch {i//batch_size + 1}: {cursor.rowcount} suffixes")

        print(f"Successfully inserted {inserted_count} suffixes into domain_suffix table")

        # Get total count
        cursor.execute("SELECT COUNT(*) FROM domain_suffix")
        total_count = cursor.fetchone()[0]
        print(f"Total suffixes in domain_suffix table: {total_count}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_suffix: {e}")
        return False
    finally:
        # Close resources only if the connection was actually established.
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()
|
||||||
|
|
||||||
|
def show_sample_data():
    """Print the first 10 rows of each table as a post-population sanity check.

    Errors are reported to stdout rather than raised, so a display failure
    does not abort the surrounding population run.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        print("\n=== Sample data from domain_root table ===")
        cursor.execute("SELECT id, root, created_at FROM domain_root ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            print(f"{row[0]} {row[1]} {row[2]}")

        print("\n=== Sample data from domain_suffix table ===")
        cursor.execute("SELECT id, suffix, created_at FROM domain_suffix ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            print(f"{row[0]} {row[1]} {row[2]}")

        cursor.close()
        conn.close()

    except mysql.connector.Error as e:
        print(f"Database error: {e}")
|
||||||
|
|
||||||
|
def main():
    """Entry point: fetch TLD + PSL data and populate both tables.

    The MariaDB password is taken from argv[1] when given (as documented in
    the README: `python populate_separate_tables.py 0000`), otherwise the
    user is prompted for it without echo. Exits with status 1 on any
    fetch or populate failure.
    """
    import getpass

    # Get password from command line argument or prompt
    if len(sys.argv) > 1:
        password = sys.argv[1]
    else:
        password = getpass.getpass("Enter MariaDB password for user 'root': ")

    DB_CONFIG['password'] = password

    print("Starting separate tables population process...")

    # Fetch TLD data
    print(f"\nFetching TLD data from: {IANA_TLD_URL}")
    tlds = fetch_tld_data()
    if not tlds:
        print("Failed to fetch TLD data")
        sys.exit(1)
    print(f"Fetched {len(tlds)} TLDs")

    # Fetch PSL data
    print(f"\nFetching PSL data from: {PSL_URL}")
    suffixes = fetch_psl_data()
    if not suffixes:
        print("Failed to fetch PSL data")
        sys.exit(1)
    print(f"Fetched {len(suffixes)} suffixes")

    # Populate domain_root table
    print(f"\nPopulating domain_root table...")
    if not populate_domain_root(tlds):
        print("Failed to populate domain_root table")
        sys.exit(1)

    # Populate domain_suffix table
    print(f"\nPopulating domain_suffix table...")
    if not populate_domain_suffix(suffixes):
        print("Failed to populate domain_suffix table")
        sys.exit(1)

    # Show sample data
    show_sample_data()

    print("\n=== Tables population completed successfully ===")


if __name__ == "__main__":
    main()
|
||||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
mysql-connector-python==8.2.0
|
||||||
|
requests==2.31.0
|
||||||
308
update.py
Normal file
308
update.py
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Script to update domain_root and domain_suffix tables with soft delete functionality
|
||||||
|
"""
|
||||||
|
|
||||||
|
import mysql.connector
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Database connection configuration
# (duplicated from populate_separate_tables.py; matches the README details)
DB_CONFIG = {
    'host': 'l2',
    'port': 3306,
    'user': 'root',
    'password': None,  # Will be set from command line or input
    'database': 'sp_spider',
    'charset': 'utf8mb4',
    'ssl_disabled': True,  # server is reached without TLS (README uses --ssl=FALSE)
    'auth_plugin': 'mysql_native_password'
}

# URLs for data sources
IANA_TLD_URL = 'https://data.iana.org/TLD/tlds-alpha-by-domain.txt'  # official IANA TLD registry
PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'  # Mozilla Public Suffix List
|
||||||
|
|
||||||
|
def fetch_tld_data():
    """Fetch TLD data from IANA.

    Downloads tlds-alpha-by-domain.txt, skips the '#' comment/header lines,
    and lowercases each TLD.

    Returns:
        list[str]: lowercase TLDs, or None if the download failed.
    """
    try:
        # timeout prevents the script from hanging indefinitely on a stalled
        # connection: requests.get() has NO timeout by default.
        response = requests.get(IANA_TLD_URL, timeout=30)
        response.raise_for_status()

        lines = response.text.strip().split('\n')
        tlds = []

        for line in lines:
            line = line.strip()
            if line and not line.startswith('#'):
                tlds.append(line.lower())

        return tlds
    except requests.RequestException as e:
        print(f"Error fetching TLD data: {e}")
        return None
|
||||||
|
|
||||||
|
def fetch_psl_data():
    """Fetch Public Suffix List data.

    Downloads public_suffix_list.dat, skips '//' comment lines and blanks,
    and lowercases each suffix.

    Returns:
        list[str]: lowercase suffixes, or None if the download failed.
    """
    try:
        # timeout prevents the script from hanging indefinitely on a stalled
        # connection: requests.get() has NO timeout by default.
        response = requests.get(PSL_URL, timeout=30)
        response.raise_for_status()

        lines = response.text.strip().split('\n')
        suffixes = []

        for line in lines:
            line = line.strip()
            if line and not line.startswith('//'):
                suffixes.append(line.lower())

        return suffixes
    except requests.RequestException as e:
        print(f"Error fetching PSL data: {e}")
        return None
|
||||||
|
|
||||||
|
def update_domain_root(tlds):
    """Synchronize domain_root with the IANA TLD list using soft deletes.

    Steps:
      1. Mark active rows that vanished from the source as removed = TRUE.
      2. Restore previously removed rows that reappear in the source.
      3. Insert brand-new TLDs with removed = FALSE (batched, INSERT IGNORE).
      4. Refresh updated_at on still-active rows and print statistics.

    Args:
        tlds: iterable of lowercase TLD strings fetched from IANA.

    Returns:
        True on success, False on any mysql.connector error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Get current *active* entries in the database
        cursor.execute("SELECT id, root FROM domain_root WHERE removed = FALSE")
        current_entries = {row[1]: row[0] for row in cursor.fetchall()}

        # Sets for O(1) membership tests
        tld_set = set(tlds)
        current_tlds = set(current_entries.keys())

        # 1. Mark entries as removed if not in source
        removed_tlds = current_tlds - tld_set
        if removed_tlds:
            print(f"Marking {len(removed_tlds)} TLDs as removed")
            cursor.executemany(
                "UPDATE domain_root SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE root = %s",
                [(tld,) for tld in removed_tlds]
            )

        # 2. Restore previously removed entries that reappear in the source.
        # BUG FIX: the original intersected *active* roots with the source and
        # then searched for removed = TRUE among them — that can never match,
        # so restoration was dead code; it also formatted an invalid "IN ()"
        # clause whenever the set was empty. We instead fetch the soft-deleted
        # roots and intersect those with the source.
        cursor.execute("SELECT root FROM domain_root WHERE removed = TRUE")
        soft_deleted = {row[0] for row in cursor.fetchall()}
        to_restore = sorted(soft_deleted & tld_set)
        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed TLDs")
            cursor.executemany(
                "UPDATE domain_root SET removed = FALSE, updated_at = CURRENT_TIMESTAMP WHERE root = %s",
                [(tld,) for tld in to_restore]
            )

        # 3. Add brand-new entries. Restored rows are excluded so the stats
        # are accurate; INSERT IGNORE still guards the UNIQUE root column.
        new_tlds = tld_set - current_tlds - soft_deleted
        if new_tlds:
            print(f"Adding {len(new_tlds)} new TLDs")
            insert_query = "INSERT IGNORE INTO domain_root (root, removed) VALUES (%s, FALSE)"
            batch_size = 100
            new_list = sorted(new_tlds)

            for i in range(0, len(new_list), batch_size):
                batch = new_list[i:i + batch_size]
                cursor.executemany(insert_query, [(tld,) for tld in batch])
                conn.commit()
                print(f"domain_root batch {i//batch_size + 1}: {cursor.rowcount} new TLDs")

        # 4. Refresh updated_at for active entries confirmed by the source
        # (parameterized per-row instead of an interpolated IN clause, which
        # also broke on an empty set).
        verified_active = current_tlds & tld_set
        if verified_active:
            print(f"Updating timestamps for {len(verified_active)} verified active TLDs")
            cursor.executemany(
                "UPDATE domain_root SET updated_at = CURRENT_TIMESTAMP WHERE removed = FALSE AND root = %s",
                [(tld,) for tld in verified_active]
            )

        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_root WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_root WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]

        print(f"domain_root update completed:")
        print(f"  Active entries: {active_count}")
        print(f"  Removed entries: {removed_count}")
        print(f"  New entries added: {len(new_tlds)}")
        print(f"  Entries marked as removed: {len(removed_tlds)}")
        print(f"  Entries restored: {len(to_restore)}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_root: {e}")
        return False
    finally:
        # Close resources only if the connection was actually established.
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()
|
||||||
|
|
||||||
|
def update_domain_suffix(suffixes):
    """Synchronize domain_suffix with the Public Suffix List using soft deletes.

    Steps:
      1. Mark active rows that vanished from the source as removed = TRUE.
      2. Restore previously removed rows that reappear in the source.
      3. Insert brand-new suffixes with removed = FALSE (batched, INSERT IGNORE).
      4. Refresh updated_at on still-active rows and print statistics.

    Args:
        suffixes: iterable of lowercase suffix strings from the PSL.

    Returns:
        True on success, False on any mysql.connector error.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # Get current *active* entries in the database
        cursor.execute("SELECT id, suffix FROM domain_suffix WHERE removed = FALSE")
        current_entries = {row[1]: row[0] for row in cursor.fetchall()}

        # Sets for O(1) membership tests
        suffix_set = set(suffixes)
        current_suffixes = set(current_entries.keys())

        # 1. Mark entries as removed if not in source
        removed_suffixes = current_suffixes - suffix_set
        if removed_suffixes:
            print(f"Marking {len(removed_suffixes)} suffixes as removed")
            cursor.executemany(
                "UPDATE domain_suffix SET removed = TRUE, updated_at = CURRENT_TIMESTAMP WHERE suffix = %s",
                [(suffix,) for suffix in removed_suffixes]
            )

        # 2. Restore previously removed entries that reappear in the source.
        # BUG FIX: the original intersected *active* suffixes with the source
        # and then searched for removed = TRUE among them — that can never
        # match, so restoration was dead code; it also formatted an invalid
        # "IN ()" clause whenever the set was empty. We instead fetch the
        # soft-deleted suffixes and intersect those with the source.
        cursor.execute("SELECT suffix FROM domain_suffix WHERE removed = TRUE")
        soft_deleted = {row[0] for row in cursor.fetchall()}
        to_restore = sorted(soft_deleted & suffix_set)
        if to_restore:
            print(f"Restoring {len(to_restore)} previously removed suffixes")
            cursor.executemany(
                "UPDATE domain_suffix SET removed = FALSE, updated_at = CURRENT_TIMESTAMP WHERE suffix = %s",
                [(suffix,) for suffix in to_restore]
            )

        # 3. Add brand-new entries. Restored rows are excluded so the stats
        # are accurate; INSERT IGNORE still guards the UNIQUE suffix column.
        new_suffixes = suffix_set - current_suffixes - soft_deleted
        if new_suffixes:
            print(f"Adding {len(new_suffixes)} new suffixes")
            insert_query = "INSERT IGNORE INTO domain_suffix (suffix, removed) VALUES (%s, FALSE)"
            batch_size = 100
            new_list = sorted(new_suffixes)

            for i in range(0, len(new_list), batch_size):
                batch = new_list[i:i + batch_size]
                cursor.executemany(insert_query, [(suffix,) for suffix in batch])
                conn.commit()
                print(f"domain_suffix batch {i//batch_size + 1}: {cursor.rowcount} new suffixes")

        # 4. Refresh updated_at for active entries confirmed by the source
        # (parameterized per-row instead of an interpolated IN clause, which
        # also broke on an empty set).
        verified_active = current_suffixes & suffix_set
        if verified_active:
            print(f"Updating timestamps for {len(verified_active)} verified active suffixes")
            cursor.executemany(
                "UPDATE domain_suffix SET updated_at = CURRENT_TIMESTAMP WHERE removed = FALSE AND suffix = %s",
                [(suffix,) for suffix in verified_active]
            )

        conn.commit()

        # Show statistics
        cursor.execute("SELECT COUNT(*) FROM domain_suffix WHERE removed = FALSE")
        active_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM domain_suffix WHERE removed = TRUE")
        removed_count = cursor.fetchone()[0]

        print(f"domain_suffix update completed:")
        print(f"  Active entries: {active_count}")
        print(f"  Removed entries: {removed_count}")
        print(f"  New entries added: {len(new_suffixes)}")
        print(f"  Entries marked as removed: {len(removed_suffixes)}")
        print(f"  Entries restored: {len(to_restore)}")

        return True

    except mysql.connector.Error as e:
        print(f"Database error in domain_suffix: {e}")
        return False
    finally:
        # Close resources only if the connection was actually established.
        if 'conn' in locals() and conn.is_connected():
            cursor.close()
            conn.close()
|
||||||
|
|
||||||
|
def show_sample_data():
    """Print the first 10 rows of each table, with ACTIVE/REMOVED status.

    Errors are reported to stdout rather than raised, so a display failure
    does not abort the surrounding update run.
    """
    try:
        conn = mysql.connector.connect(**DB_CONFIG)
        cursor = conn.cursor()

        print("\n=== Sample data from domain_root table ===")
        cursor.execute("SELECT id, root, removed, created_at FROM domain_root ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            # row[2] is the BOOLEAN `removed` soft-delete flag
            status = "REMOVED" if row[2] else "ACTIVE"
            print(f"{row[0]} {row[1]} [{status}] {row[3]}")

        print("\n=== Sample data from domain_suffix table ===")
        cursor.execute("SELECT id, suffix, removed, created_at FROM domain_suffix ORDER BY id LIMIT 10")
        for row in cursor.fetchall():
            status = "REMOVED" if row[2] else "ACTIVE"
            print(f"{row[0]} {row[1]} [{status}] {row[3]}")

        cursor.close()
        conn.close()

    except mysql.connector.Error as e:
        print(f"Database error: {e}")
|
||||||
|
|
||||||
|
def main():
    """Entry point: fetch TLD + PSL data and soft-delete-update both tables.

    The MariaDB password is taken from argv[1] when given (as documented in
    the README: `python update.py 0000`), otherwise the user is prompted for
    it without echo. Exits with status 1 on any fetch or update failure.
    """
    import getpass

    # Get password from command line argument or prompt
    if len(sys.argv) > 1:
        password = sys.argv[1]
    else:
        password = getpass.getpass("Enter MariaDB password for user 'root': ")

    DB_CONFIG['password'] = password

    print("Starting domain tables update process with soft delete functionality...")

    # Fetch TLD data
    print(f"\nFetching TLD data from: {IANA_TLD_URL}")
    tlds = fetch_tld_data()
    if not tlds:
        print("Failed to fetch TLD data")
        sys.exit(1)
    print(f"Fetched {len(tlds)} TLDs")

    # Fetch PSL data
    print(f"\nFetching PSL data from: {PSL_URL}")
    suffixes = fetch_psl_data()
    if not suffixes:
        print("Failed to fetch PSL data")
        sys.exit(1)
    print(f"Fetched {len(suffixes)} suffixes")

    # Update domain_root table
    print(f"\nUpdating domain_root table...")
    if not update_domain_root(tlds):
        print("Failed to update domain_root table")
        sys.exit(1)

    # Update domain_suffix table
    print(f"\nUpdating domain_suffix table...")
    if not update_domain_suffix(suffixes):
        print("Failed to update domain_suffix table")
        sys.exit(1)

    # Show sample data
    show_sample_data()

    print("\n=== Tables update completed successfully ===")


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user