Source of file BrokenScriptsURLS.php
Size: 3,584 Bytes - Last Modified: 2021-12-24T06:19:30+00:00
/var/www/docs.ssmods.com/process/src/code/tasks/BrokenScriptsURLS.php
<?php

/**
 * Build task that scans the source files of one module for external URLs,
 * requests each URL with cURL and stores a BrokenURL record for every URL
 * whose HTTP status code falls outside the 200-302 range.
 *
 * Request variables read by run():
 *  - module:     folder below BASE_PATH to scan (defaults to 'cms')
 *  - excludeDir: optional substring; files whose directory path contains it
 *                are skipped (e.g. changelog folders)
 */
class BrokenScriptsURLS extends BuildTask
{
    protected $title = 'Search module scripts and templates for URLs that are broken';

    protected $description = 'A task that records external broken links in the source code of scripts and templates';

    protected $enabled = true;

    // file extensions (without the dot) of the files that are scanned
    protected $checkExtensions = array(
        'php',
        'md',
        'js',
        'ss'
    );

    // list of fairly generic domains used as examples
    protected $skipDomains = array(
        'my-host.com',
        'myhost.com',
        'mysite.com',
        'test.com',
        'example.com',
        'example.org',
        'mydomain',
        'website.com',
        'playboy.com'
    );

    /**
     * Scan the requested module and record every broken URL found in it.
     *
     * Prints a short HTML report (one pair of paragraphs per broken URL
     * occurrence) and writes a BrokenURL record for each URL/module/script
     * combination that is not stored yet.
     *
     * @param object $request request object; only getVar('module') and
     *                        getVar('excludeDir') are used
     * @return void
     */
    public function run($request)
    {
        // no point continuing without curl_init
        if (!function_exists('curl_init')) {
            echo '<p>curl_init is not available</p>';
            return;
        }

        // NOTE(review): 'module' comes straight from the request and is appended
        // to BASE_PATH, so a value containing '..' can point outside the project.
        // Confirm this task is only reachable by trusted users.
        $module = ($request->getVar('module')) ? $request->getVar('module') : 'cms';

        // some folders we may want to ignore like changelogs in the framework module
        $excludeDir = ($request->getVar('excludeDir')) ? $request->getVar('excludeDir') : '';

        $folder = BASE_PATH . DIRECTORY_SEPARATOR . $module;

        // is_dir() rather than file_exists(): the directory iterator throws
        // when it is handed a regular file
        if (!is_dir($folder)) {
            return;
        }

        $search = $this->buildUrlPattern();
        if ($search === null) {
            // without any TLD the pattern would match every "./" sequence
            return;
        }

        $skip = $this->buildSkipPattern();

        // HTTP codes already fetched during this run, keyed by URL, so a URL
        // that occurs in many files is only requested once
        $httpCodes = array();

        foreach ($this->findScripts($folder, $excludeDir) as $script) {
            $fileText = file_get_contents($script);
            if ($fileText === false) {
                continue;
            }

            // preg_match_all() returns 0 for "no match" and false on error;
            // in both cases there is nothing to check in this file
            if (!preg_match_all($search, $fileText, $matches)) {
                continue;
            }

            foreach ($matches[0] as $href) {
                // domains used as placeholders in examples are not checked
                if ($skip !== null && preg_match($skip, $href)) {
                    continue;
                }

                if (!isset($httpCodes[$href])) {
                    $httpCodes[$href] = $this->fetchHttpCode($href);
                }
                $httpCode = $httpCodes[$href];

                if ($httpCode >= 200 && $httpCode <= 302) {
                    continue;
                }

                echo '<p>Checking script ' . htmlspecialchars($script, ENT_QUOTES, 'UTF-8') . '</p>';
                echo '<p>URL ' . htmlspecialchars($href, ENT_QUOTES, 'UTF-8')
                    . ' returns HTTP Code ' . $httpCode . '</p>';

                $this->recordBrokenLink($href, $module, $script, $httpCode);
            }
        }
    }

    /**
     * Collect the paths of all files below $folder that have one of the
     * extensions in $checkExtensions and are not inside an excluded directory.
     *
     * @param string $folder     directory to walk recursively
     * @param string $excludeDir substring of directory paths to skip; '' disables the filter
     * @return array list of file paths
     */
    protected function findScripts($folder, $excludeDir)
    {
        $iter = new RecursiveIteratorIterator(
            new RecursiveDirectoryIterator($folder, RecursiveDirectoryIterator::SKIP_DOTS),
            RecursiveIteratorIterator::SELF_FIRST,
            RecursiveIteratorIterator::CATCH_GET_CHILD
        );

        $scripts = array();

        foreach ($iter as $path => $file) {
            if ($file->isDir()) {
                continue;
            }

            if (!in_array(pathinfo($path, PATHINFO_EXTENSION), $this->checkExtensions, true)) {
                continue;
            }

            // strict comparison: strpos() returns 0 for a match at the very
            // start of the string, which must still count as "found"
            if ($excludeDir !== '' && strpos(pathinfo($path, PATHINFO_DIRNAME), $excludeDir) !== false) {
                continue;
            }

            $scripts[] = $path;
        }

        return $scripts;
    }

    /**
     * Build the regular expression that finds "host.tld/path" style URLs.
     *
     * The tlds are an important part of this regexp, mainly so lines without
     * a valid tld are not matched.
     *
     * @return string|null the pattern, or null when no enabled TLD is available
     */
    protected function buildUrlPattern()
    {
        $tlds = array();

        // TODO: Will need to add support for Punycode URLs at some point
        $tldObj = TopLevelDomain::get()
            ->filter(array('Enabled' => 1, 'Punycode' => 0));

        foreach ($tldObj as $tld) {
            $name = trim($tld->TLD);
            if ($name !== '') {
                // quote so that e.g. the dot in a multi-part TLD stays literal
                $tlds[] = preg_quote($name, '~');
            }
        }

        if (count($tlds) === 0) {
            return null;
        }

        // The hyphen is placed last in both character classes so it is a
        // literal; a hyphen between "." and "\w" is rejected by PCRE2 as an
        // invalid range. No trailing newline class: line breaks must not
        // become part of the URL that is requested and stored.
        return '~([.\w-]*)(\.)(' . implode('|', $tlds) . ')(\?|/)([?.a-zA-Z0-9/=_#&%\~-]*)~i';
    }

    /**
     * Build the regular expression matching the domains in $skipDomains.
     *
     * @return string|null the pattern, or null when there is nothing to skip
     */
    protected function buildSkipPattern()
    {
        $domains = array();

        foreach ($this->skipDomains as $domain) {
            $domains[] = preg_quote($domain, '~');
        }

        if (count($domains) === 0) {
            return null;
        }

        return '~(' . implode('|', $domains) . ')~i';
    }

    /**
     * Request $url with cURL and return the HTTP status code of the response.
     *
     * @param string $url URL to request
     * @return int HTTP status code; 0 when the handle could not be created
     *             or cURL reports no status code
     */
    protected function fetchHttpCode($url)
    {
        $handle = curl_init($url);
        if ($handle === false) {
            return 0;
        }

        // keep the response body out of the task output
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        curl_exec($handle);
        $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        return (int) $httpCode;
    }

    /**
     * Store a BrokenURL record unless one with the same URL, module and
     * script name already exists.
     *
     * @param string $href     the broken URL
     * @param string $module   module that was scanned
     * @param string $script   path of the file containing the URL; only its basename is stored
     * @param int    $httpCode status code returned for the URL
     * @return void
     */
    protected function recordBrokenLink($href, $module, $script, $httpCode)
    {
        $scriptName = basename($script);

        $existing = BrokenURL::get()
            ->filter(array(
                'URL' => $href,
                'Module' => $module,
                'Script' => $scriptName
            ));

        if ($existing->exists()) {
            return;
        }

        $brokenLink = new BrokenURL();
        $brokenLink->URL = $href;
        $brokenLink->Module = $module;
        $brokenLink->Script = $scriptName;
        $brokenLink->HTTPCode = $httpCode;
        $brokenLink->write();
    }
}