Files
tools/.hta_slug/schema-validator.php

88 lines
2.0 KiB
PHP

<?php
/**
 * JSON-LD schema detector endpoint.
 *
 * Reads a JSON request body of the form {"url": "..."}, downloads that page
 * over HTTP(S), extracts every <script type="application/ld+json"> block and
 * reports the schema "@type" values found in them.
 *
 * The response is always a JSON object with these keys:
 *   - success  (bool)     true when no error was recorded
 *   - detected (string[]) unique "@type" values found
 *   - errors   (string[]) validation, fetch, or JSON decoding errors
 *   - warnings (string[]) non-fatal findings
 *   - info     (string[]) hints (e.g. the page looks client-side rendered)
 */
header('Content-Type: application/json; charset=utf-8');

/**
 * Write the response array to the output as pretty-printed JSON.
 *
 * Every exit path goes through this function so the output format is the
 * same for error and success responses.
 *
 * @param array $response Response structure described in the file header.
 */
function emitResponse(array $response)
{
    echo json_encode($response, JSON_PRETTY_PRINT);
}

/**
 * Download the body of an http/https URL.
 *
 * The URL is supplied by the client, so cURL is restricted to the HTTP and
 * HTTPS protocols for both the initial request and any redirect it follows;
 * otherwise schemes such as file:// could be used to read local resources.
 *
 * NOTE(review): the target host is not checked against loopback/private
 * address ranges, so internal HTTP services are still reachable through this
 * endpoint. Add an allow-list or address check if that matters for the
 * deployment.
 *
 * @param string $url URL to fetch.
 * @return string|false Response body, or false when the transfer failed.
 */
function fetchHtml(string $url)
{
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }

    $allowedProtocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER  => true,
        CURLOPT_FOLLOWLOCATION  => true,
        CURLOPT_MAXREDIRS       => 5, // bound redirect chains
        CURLOPT_PROTOCOLS       => $allowedProtocols,
        CURLOPT_REDIR_PROTOCOLS => $allowedProtocols,
        CURLOPT_CONNECTTIMEOUT  => 10,
        CURLOPT_TIMEOUT         => 20,
        CURLOPT_USERAGENT       => 'SchemaValidatorBot/1.0',
        CURLOPT_ENCODING        => '', // accept any encoding cURL supports (gzip, ...)
    ]);

    $body = curl_exec($ch);

    // The handle is released when $ch goes out of scope on return.
    return is_string($body) ? $body : false;
}

/**
 * Collect the "@type" values of a decoded JSON-LD document.
 *
 * Handles a single node, a top-level list of nodes, and nodes wrapped in an
 * "@graph" container. "@type" may be a single string or a list of strings;
 * non-string entries are ignored. Anything that is not an array (for example
 * a JSON-LD block that decodes to a bare string or number) yields no types.
 *
 * @param mixed $node Value returned by json_decode(..., true).
 * @return string[] Type names in document order (may contain duplicates).
 */
function collectSchemaTypes($node): array
{
    if (!is_array($node)) {
        return [];
    }

    $types = [];

    if (isset($node['@type'])) {
        // (array) turns a single string into a one-element list.
        foreach ((array) $node['@type'] as $type) {
            if (is_string($type)) {
                $types[] = $type;
            }
        }
    } elseif (isset($node[0])) {
        // Top-level list of nodes.
        foreach ($node as $item) {
            $types = array_merge($types, collectSchemaTypes($item));
        }
    }

    if (isset($node['@graph'])) {
        // "@graph" may hold one node or a list of nodes; both are handled
        // by the recursive call.
        $types = array_merge($types, collectSchemaTypes($node['@graph']));
    }

    return $types;
}

$response = [
    "success"  => false,
    "detected" => [],
    "errors"   => [],
    "warnings" => [],
    "info"     => [],
];

/* read and validate input */
$input = json_decode((string) file_get_contents("php://input"), true);
$url = (is_array($input) && isset($input['url']) && is_string($input['url']))
    ? $input['url']
    : null;

if ($url === null || $url === '' || filter_var($url, FILTER_VALIDATE_URL) === false) {
    $response['errors'][] = "Valid URL required";
    emitResponse($response);
    exit;
}

// FILTER_VALIDATE_URL accepts any scheme, so restrict to web URLs explicitly.
$scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
if ($scheme !== 'http' && $scheme !== 'https') {
    $response['errors'][] = "Only http and https URLs are supported";
    emitResponse($response);
    exit;
}

/* fetch html */
$html = fetchHtml($url);
if ($html === false || $html === '') {
    $response['errors'][] = "Failed to fetch URL";
    emitResponse($response);
    exit;
}

/* extract JSON-LD */
$found = preg_match_all(
    '/<script[^>]+type=["\']application\/ld\+json["\'][^>]*>(.*?)<\/script>/is',
    $html,
    $matches
);

if ($found === false) {
    // A PCRE failure must not be reported as "no schema on the page".
    $response['errors'][] = "Failed to scan HTML for JSON-LD";
    emitResponse($response);
    exit;
}

if (empty($matches[1])) {
    $response['warnings'][] = "No schema found in initial HTML";

    // Heuristic: markers commonly left by client-side rendered apps.
    if (
        stripos($html, '__NEXT_DATA__') !== false ||
        stripos($html, 'id="root"') !== false ||
        stripos($html, 'data-reactroot') !== false
    ) {
        $response['info'][] =
            "This site appears to be client-side rendered. Schema may exist but cannot be validated using PHP-only.";
    }

    emitResponse($response);
    exit;
}

/* analyze schemas */
foreach ($matches[1] as $schema) {
    $json = json_decode($schema, true);

    if (json_last_error() !== JSON_ERROR_NONE) {
        // Record the decode error for this block and keep going with the rest.
        $response['errors'][] = json_last_error_msg();
        continue;
    }

    $response['detected'] = array_merge($response['detected'], collectSchemaTypes($json));
}

$response['detected'] = array_values(array_unique($response['detected']));
$response['success'] = empty($response['errors']);

emitResponse($response);