Source of file StaticSiteUrlList.php
Size: 18,599 Bytes - Last Modified: 2021-12-23T10:34:50+00:00
/var/www/docs.ssmods.com/process/src/code/StaticSiteUrlList.php
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613 | <?php require_once('../vendor/cuab/phpcrawl/libs/PHPCrawler.class.php'); /** * Represents a set of URLs parsed from a site. * * Makes use of PHPCrawl to prepare a list of URLs on the site */ class StaticSiteUrlList { protected $baseURL, $cacheDir; /** * Two element array: contains keys 'inferred' and 'regular': * - 'regular' is an array mapping raw URLs to processed URLs * - 'inferred' is an array of inferred URLs */ protected $urls = null; protected $autoCrawl = false; protected $urlProcessor = null; protected $extraCrawlURLs = null; /** * A list of regular expression patterns to exclude from scraping * * @var array */ protected $excludePatterns = array(); /** * Create a new URL List * @param string $baseURL The Base URL to find links on * @param string $cacheDir The local path to cache data into */ public function __construct($baseURL, $cacheDir) { // baseURL mus not have a trailing slash if (substr($baseURL, -1) == "/") { $baseURL = substr($baseURL, 0, -1); } // cacheDir must have a trailing slash if (substr($cacheDir, -1) != "/") { $cacheDir .= "/"; } $this->baseURL = $baseURL; $this->cacheDir = $cacheDir; } /** * Set a URL processor for this URL List. * * URL processors process the URLs before the site heirarchy and inferred meta-data are generated. * These can be used to tranform URLs from CMSes that don't provide a natural heirarchy into something * more useful. * * See {@link StaticSiteMOSSURLProcessor} for an example. * * @param StaticSiteUrlProcessor $urlProcessor [description] */ public function setUrlProcessor(StaticSiteUrlProcessor $urlProcessor) { $this->urlProcessor = $urlProcessor; } /** * Define additional crawl URLs as an array * Each of these URLs will be crawled in addition the base URL. * This can be helpful if pages are getting missed by the crawl */ public function setExtraCrawlURls($extraCrawlURLs) { $this->extraCrawlURLs = $extraCrawlURLs; } /** * Return the additional crawl URLs as an array */ public function getExtraCrawlURLs() { return $this->extraCrawlURLs; } /** * Set an array of regular expression patterns that should be excluded from * being added to the url list * * @param array $excludePatterns */ public function setExcludePatterns(array $excludePatterns) { $this->excludePatterns = $excludePatterns; } /** * Get an array of regular expression patterns that should not be added to * the url list * * @return array */ public function getExcludePatterns() { return $this->excludePatterns; } /** * * Set whether the crawl should be triggered on demand. * @param [type] $autoCrawl [description] */ public function setAutoCrawl($autoCrawl) { $this->autoCrawl = $autoCrawl; } /** * Returns the status of the spidering: "Complete", "Partial", or "Not started" * @return [type] [description] */ public function getSpiderStatus() { if (file_exists($this->cacheDir . 'urls')) { if (file_exists($this->cacheDir . 'crawlerid')) { return "Partial"; } else { return "Complete"; } } else { return "Not started"; } } /** * Return the number of URLs crawled so far */ public function getNumURLs() { if ($this->urls) { $urls = $this->urls; // Don't rely on loadUrls() as it chokes on partially completed imports } elseif (file_exists($this->cacheDir . 'urls')) { $urls = unserialize(file_get_contents($this->cacheDir . 'urls')); } else { return null; } return sizeof(array_unique($urls['regular'])) + sizeof($urls['inferred']); } /** * Return the raw URLs as an array * @return array */ public function getRawURLs() { if ($urls = $this->getProcessedURLs()) { return array_keys($urls); } } /** * Return a map of URLs crawled, with raw URLs as keys and processed URLs as values * @return array */ public function getProcessedURLs() { if ($this->hasCrawled() || $this->autoCrawl) { if ($this->urls === null) { $this->loadUrls(); } return array_merge( $this->urls['regular'], $this->urls['inferred'] ? array_combine($this->urls['inferred'], $this->urls['inferred']) : array() ); } } public function hasCrawled() { // There are URLs and we're not in the middle of a crawl return file_exists($this->cacheDir . 'urls') && !file_exists($this->cacheDir . 'crawlerid'); } /** * Load the URLs, either by crawling, or by fetching from cache * @return void */ public function loadUrls() { if ($this->hasCrawled()) { $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls')); // Clear out obsolete format if (!isset($this->urls['regular']) || !isset($this->urls['inferred'])) { $this->urls = array('regular' => array(), 'inferred' => array()); } } elseif ($this->autoCrawl) { $this->crawl(); } else { throw new LogicException("Crawl hasn't been executed yet, and autoCrawl is set to false"); } } /** * Re-execute the URL processor on all the fetched URLs * @return void */ public function reprocessUrls() { if ($this->urls === null) { $this->loadUrls(); } // Clear out all inferred URLs; these will be added $this->urls['inferred'] = array(); // Reprocess URLs, in case the processing has changed since the last crawl foreach ($this->urls['regular'] as $url => $oldProcessed) { $processedURL = $this->generateProcessedURL($url); $this->urls['regular'][$url] = $processedURL; // Trigger parent URL back-filling on new processed URL $this->parentProcessedURL($processedURL); } $this->saveURLs(); } /** * * @param int $limit * @param bool $verbose * @return \StaticSiteCrawler */ public function crawl($limit=false, $verbose=false) { increase_time_limit_to(3600); if (!is_dir($this->cacheDir)) { mkdir($this->cacheDir); } $crawler = new StaticSiteCrawler($this, $limit, $verbose); $crawler->enableResumption(); $crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE); $crawler->setWorkingDirectory($this->cacheDir); // Allow for resuming an incomplete crawl if (file_exists($this->cacheDir.'crawlerid')) { // We should re-load the partial list of URLs, if relevant // This should only happen when we are resuming a partial crawl if (file_exists($this->cacheDir . 'urls')) { $this->urls = unserialize(file_get_contents($this->cacheDir . 'urls')); } else { $this->urls = array('regular' => array(), 'inferred' => array()); } $crawlerID = file_get_contents($this->cacheDir.'crawlerid'); $crawler->resume($crawlerID); } else { $crawlerID = $crawler->getCrawlerId(); file_put_contents($this->cacheDir.'/crawlerid', $crawlerID); $this->urls = array('regular' => array(), 'inferred' => array()); } $crawler->setURL($this->baseURL); $crawler->go(); unlink($this->cacheDir.'crawlerid'); ksort($this->urls['regular']); ksort($this->urls['inferred']); $this->saveURLs(); return $crawler; } /** * Save the current list of URLs to disk * @return [type] [description] */ public function saveURLs() { file_put_contents($this->cacheDir . 'urls', serialize($this->urls)); } /** * Add a URL to this list, given the absolute URL * @param string $url The absolute URL */ public function addAbsoluteURL($url) { $simpifiedURL = $this->simplifyURL($url); $simpifiedBase = $this->simplifyURL($this->baseURL); if (substr($simpifiedURL, 0, strlen($simpifiedBase)) == $simpifiedBase) { $relURL = substr($url, strlen($this->baseURL)); } else { throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); } return $this->addURL($relURL); } public function addURL($url) { if ($this->urls === null) { $this->loadUrls(); } // Generate and save the processed URLs $this->urls['regular'][$url] = $this->generateProcessedURL($url); // Trigger parent URL back-filling $this->parentProcessedURL($this->urls['regular'][$url]); } /** * Add an inferred URL to the list. * * Since the unprocessed URL isn't available, we use the processed URL in its place. This should be used with * some caution. * * @param string $processedURL The processed URL to add. */ public function addInferredURL($inferredURL) { if ($this->urls === null) { $this->loadUrls(); } // Generate and save the processed URLs $this->urls['inferred'][$inferredURL] = $inferredURL; // Trigger parent URL back-filling $this->parentProcessedURL($inferredURL); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** * Return true if the given URL exists * @param string $url The URL, either absolute, or relative starting with "/" * @return boolean Does the URL exist */ public function hasURL($url) { if ($this->urls === null) { $this->loadUrls(); } // Try and relativise an absolute URL if ($url[0] != '/') { $simpifiedURL = $this->simplifyURL($url); $simpifiedBase = $this->simplifyURL($this->baseURL); if (substr($simpifiedURL, 0, strlen($simpifiedBase)) == $simpifiedBase) { $url = substr($simpifiedURL, strlen($simpifiedBase)); } else { throw new InvalidArgumentException("URL $url is not from the site $this->baseURL"); } } return isset($this->urls['regular'][$url]) || in_array($url, $this->urls['inferred']); } /** * Simplify a URL. * Ignores https/http differences and "www." / non differences. * * @param string $url * @return string */ protected function simplifyURL($url) { return preg_replace('#^https?://(www\.)?#i', 'http://www.', $url); } /** * Returns true if the given URL is in the list of processed URls * * @param string $processedURL The processed URL * @return boolean True if it exists, false otherwise */ public function hasProcessedURL($processedURL) { if ($this->urls === null) { $this->loadUrls(); } return in_array($processedURL, $this->urls['regular']) || in_array($processedURL, $this->urls['inferred']); } /** * Return the processed URL that is the parent of the given one. * * Both input and output are processed URLs * * @param string $url A relative URL * @return string [description] */ public function parentProcessedURL($processedURL) { if ($processedURL == "/") { return ""; } // URL heirachy can be broken down by querystring or by URL $breakpoint = max(strrpos($processedURL, '?'), strrpos($processedURL, '/')); // Special case for children of the root if ($breakpoint == 0) { return "/"; } // Get parent URL $parentProcessedURL = substr($processedURL, 0, $breakpoint); // If an intermediary URL doesn't exist, create it if (!$this->hasProcessedURL($parentProcessedURL)) { $this->addInferredURL($parentProcessedURL); } return $parentProcessedURL; } /** * Return the regular URL, given the processed one. * * Note that the URL processing isn't reversible, so this function works looks by iterating through all URLs. * If the URL doesn't exist in the list, this function returns null. * * @param string $processedURL The URL after processing has been applied. * @return string The original URL. */ public function unprocessedURL($processedURL) { if ($url = array_search($processedURL, $this->urls['regular'])) { return $url; } elseif (in_array($processedURL, $this->urls['inferred'])) { return $processedURL; } else { return null; } } /** * Find the processed URL in the URL list * @param [type] $url [description] * @return [type] [description] */ public function processedURL($url) { if ($this->urls === null) { $this->loadUrls(); } if (isset($this->urls['regular'][$url])) { // Generate it if missing if ($this->urls['regular'][$url] === true) { $this->urls['regular'][$url] = $this->generateProcessedURL($url); } return $this->urls['regular'][$url]; } elseif (in_array($url, $this->urls['inferred'])) { return $url; } } /** * Execute custom logic for processing URLs prior to heirachy generation. * * This can be used to implement logic such as ignoring the "/Pages/" parts of MOSS URLs, or dropping extensions. * * @param string $url The unprocessed URL * @return string The processed URL */ public function generateProcessedURL($url) { if (!$url) { throw new LogicException("Can't pass a blank URL to generateProcessedURL"); } if ($this->urlProcessor) { $url = $this->urlProcessor->processURL($url); } if (!$url) { throw new LogicException(get_class($this->urlProcessor) . " returned a blank URL."); } return $url; } /** * Return the URLs that are a child of the given URL * @param [type] $url [description] * @return [type] [description] */ public function getChildren($url) { if ($this->urls === null) { $this->loadUrls(); } $processedURL = $this->processedURL($url); // Subtly different regex if the URL ends in ? or / if (preg_match('#[/?]$#', $processedURL)) { $regEx = '#^'.preg_quote($processedURL, '#') . '[^/?]+$#'; } else { $regEx = '#^'.preg_quote($processedURL, '#') . '[/?][^/?]+$#'; } $children = array(); foreach ($this->urls['regular'] as $potentialChild => $potentialProcessedChild) { if (preg_match($regEx, $potentialProcessedChild)) { if (!isset($children[$potentialProcessedChild])) { $children[$potentialProcessedChild] = $potentialChild; } } } foreach ($this->urls['inferred'] as $potentialProcessedChild) { if (preg_match($regEx, $potentialProcessedChild)) { if (!isset($children[$potentialProcessedChild])) { $children[$potentialProcessedChild] = $potentialProcessedChild; } } } return array_values($children); } } class StaticSiteCrawler extends PHPCrawler { protected $urlList; /** * * @var bool */ protected $verbose = false; public function __construct(StaticSiteUrlList $urlList, $limit=false, $verbose=false) { parent::__construct(); $this->urlList = $urlList; $this->verbose = $verbose; if ($limit) { $this->setPageLimit($limit); } } public function handleHeaderInfo(PHPCrawlerResponseHeader $header) { // Don't parse 400/500 responses if ($header->http_status_code > 399) { $message = $header->source_url . " - skipped as it's $header->http_status_code".PHP_EOL; error_log($message, 3, '/tmp/urls'); if ($this->verbose) { echo "[!] ".$message; } return -1; } } public function handleDocumentInfo(PHPCrawlerDocumentInfo $info) { // Ignore errors and redirects if ($info->http_status_code < 200) { return; } if ($info->http_status_code > 299) { return; } // Ignore non HTML if (!preg_match('#/x?html#', $info->content_type)) { return; } $this->urlList->addAbsoluteURL($info->url); if ($this->verbose) { echo "[+] ".$info->url.PHP_EOL; } $this->urlList->saveURLs(); } protected function initCrawlerProcess() { parent::initCrawlerProcess(); // Add additional URLs to crawl to the crawler's LinkCache // NOTE: This is using an undocumented API if ($extraURLs = $this->urlList->getExtraCrawlURLs()) { foreach ($extraURLs as $extraURL) { $this->LinkCache->addUrl(new PHPCrawlerURLDescriptor($extraURL)); } } // Prevent URLs that matches the exclude patterns to be fetched if ($excludePatterns = $this->urlList->getExcludePatterns()) { foreach ($excludePatterns as $pattern) { $validRegExp = $this->addURLFilterRule('|'.str_replace('|', '\|', $pattern).'|'); if (!$validRegExp) { throw new InvalidArgumentException('Exclude url pattern "'.$pattern.'" is not a valid regular expression.'); } } } } } |