Skip to content

Commit

Permalink
TASK: Improve memory handling in crawler and simplify code
Browse files Browse the repository at this point in the history
  • Loading branch information
gradinarufelix committed Jun 8, 2024
1 parent 46d8210 commit 2710bed
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 39 deletions.
2 changes: 1 addition & 1 deletion Classes/Command/CheckLinksCommandController.php
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ private function crawlNodesCommandImplementation(array $domainsToCrawl, int &$er
foreach ($messages as $message) {
$this->output->outputFormatted('<error>' . $message . '</error>');
}
$this->output->outputLine("Problems: " . \count($messages));
$this->output->outputLine(sprintf("Problems for domain %s: %s", $domainToCrawl->__toString(), \count($messages)));
}

if ($restoreBaseUriProviderSingleton) {
Expand Down
86 changes: 49 additions & 37 deletions Classes/Domain/Crawler/ContentNodeCrawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@

use CodeQ\LinkChecker\Domain\Model\ResultItemRepositoryInterface;
use CodeQ\LinkChecker\Domain\Model\ResultItem;
use Neos\ContentRepository\Domain\Model\Node;
use Neos\ContentRepository\Domain\Model\NodeInterface;
use Neos\ContentRepository\Domain\NodeAggregate\NodeAggregateIdentifier;
use Neos\ContentRepository\Domain\Projection\Content\TraversableNodeInterface;
use Neos\ContentRepository\Domain\Repository\NodeDataRepository;
use Neos\ContentRepository\Domain\Service\ContextFactoryInterface;
use Neos\Eel\FlowQuery\FlowQuery;
use Neos\ContentRepository\Exception\NodeException;
use Neos\Flow\Annotations as Flow;
use Neos\Flow\Mvc\Routing\RouterInterface;
use Neos\Neos\Domain\Model\Domain;
Expand Down Expand Up @@ -58,49 +59,33 @@ class ContentNodeCrawler

public function crawl(ContentContext $subgraph, Domain $domain): array
{
/** @var Node[] $allContentAndDocumentNodes */
$allContentAndDocumentNodes = FlowQuery::q([$subgraph->getCurrentSiteNode()])
->find('[instanceof Neos.Neos:Document],[instanceof Neos.Neos:Content]')->get();

$messages = [];

foreach ($allContentAndDocumentNodes as $node) {
if (!$this->findIsNodeVisible($node)) {
continue;
}

$unresolvedUris = [];
$invalidPhoneNumbers = [];

// todo why use nodeData here and not the node?
$properties = $node->getNodeData()->getProperties();

foreach ($properties as $property) {
$this->crawlPropertyForNodesAndAssets($property, $subgraph, $unresolvedUris);
$this->crawlPropertyForTelephoneNumbers($property, $invalidPhoneNumbers);
}
$currentSiteNode = $subgraph->getCurrentSiteNode();
$this->crawlNode($currentSiteNode, $subgraph, $domain, $messages);
$this->crawlChildNodesRecursively($currentSiteNode, $subgraph, $domain, $messages);

foreach ($unresolvedUris as $uri) {
$messages[] = 'Not found: ' . $uri;
return $messages;
}

$this->createResultItem($subgraph, $domain, $node, $uri, 404);
}
foreach ($invalidPhoneNumbers as $phoneNumber) {
$messages[] = 'Invalid format: ' . $phoneNumber;
protected function crawlChildNodesRecursively(NodeInterface|TraversableNodeInterface $rootNode, ContentContext $subgraph, Domain $domain, array &$messages): void
{
$childNodes = $rootNode->findChildNodes();

/* @see https://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml - 490 is unassigned, and so we can use it */
$this->createResultItem($subgraph, $domain, $node, $phoneNumber, 490);
}
foreach ($childNodes as $node) {
$this->crawlNode($node, $subgraph, $domain, $messages);
$this->crawlChildNodesRecursively($node, $subgraph, $domain, $messages);
}

return $messages;
// Free memory
unset($childNodes);
}

/**
* @see \Neos\Neos\Fusion\ConvertUrisImplementation::evaluate
*/
protected function crawlPropertyForNodesAndAssets(
$property,
mixed $property,
ContentContext $subgraph,
array &$unresolvedUris
): void {
Expand Down Expand Up @@ -179,7 +164,7 @@ static function (array $matches) use (&$invalidPhoneNumbers) {
protected function createResultItem(
ContentContext $subgraph,
Domain $domain,
Node $node,
NodeInterface|TraversableNodeInterface $node,
string $uri,
int $statusCode
): void {
Expand Down Expand Up @@ -213,20 +198,21 @@ protected function createResultItem(
$this->resultItemRepository->add($resultItem);
}

private function findClosestDocumentNode(Node $node): Node
/**
 * Resolves the nearest node of type Neos.Neos:Document, starting at the
 * given node and walking upwards through its parents.
 *
 * The given node itself is returned when it already is a document node.
 * NOTE(review): what happens when no ancestor is a document depends on
 * findParentNode(), which is not visible here - confirm.
 *
 * @param NodeInterface|TraversableNodeInterface $node node to start the lookup from
 * @return NodeInterface|TraversableNodeInterface the first document node found on the way up
 */
private function findClosestDocumentNode(NodeInterface|TraversableNodeInterface $node): NodeInterface|TraversableNodeInterface
{
    $candidate = $node;

    while (true) {
        // Stop at the first node (including the start node) that is a document.
        if ($candidate->getNodeType()->isOfType('Neos.Neos:Document') !== false) {
            return $candidate;
        }

        $candidate = $candidate->findParentNode();
    }
}

private function findIsNodeVisible(Node $node): bool
private function findIsNodeVisible(NodeInterface|TraversableNodeInterface $node): bool
{
do {
$previousNode = $node;
$node = $node->getParent();
if ($node === null) {
try {
$node = $node->findParentNode();
} catch (NodeException) {
if ($previousNode->isRoot()) {
return true;
}
Expand All @@ -242,4 +228,30 @@ private function subgraphWithConfiguration(ContentContext $currentSubgraph, arra
$newSubgraph = $this->contextFactory->create(array_merge($currentConfiguration, $additionalConfiguration));
return $newSubgraph;
}

/**
 * Inspects all properties of a single node for unresolved URIs and invalid
 * phone numbers.
 *
 * For every finding a line is appended to $messages and a result item is
 * created (status 404 for unresolved URIs, 490 for invalid phone numbers).
 * This method does not filter nodes itself; every node handed in is crawled.
 *
 * @param NodeInterface|TraversableNodeInterface $node node whose properties are inspected
 * @param ContentContext $subgraph context handed to the property crawler and to the result items
 * @param Domain $domain domain handed to the result items
 * @param array $messages problem descriptions collected so far, extended in place
 */
protected function crawlNode(NodeInterface|TraversableNodeInterface $node, ContentContext $subgraph, Domain $domain, array &$messages): void
{
    $brokenUris = [];
    $malformedPhoneNumbers = [];

    // todo why use nodeData here and not the node?
    foreach ($node->getNodeData()->getProperties() as $propertyValue) {
        // Both collectors fill their list by reference.
        $this->crawlPropertyForNodesAndAssets($propertyValue, $subgraph, $brokenUris);
        $this->crawlPropertyForTelephoneNumbers($propertyValue, $malformedPhoneNumbers);
    }

    foreach ($brokenUris as $brokenUri) {
        $messages[] = 'Not found: ' . $brokenUri;
        $this->createResultItem($subgraph, $domain, $node, $brokenUri, 404);
    }

    foreach ($malformedPhoneNumbers as $malformedPhoneNumber) {
        $messages[] = 'Invalid format: ' . $malformedPhoneNumber;
        /* @see https://www.iana.org/assignments/http-status-codes/http-status-codes.xhtml - 490 is unassigned, and so we can use it */
        $this->createResultItem($subgraph, $domain, $node, $malformedPhoneNumber, 490);
    }
}
}
2 changes: 1 addition & 1 deletion Classes/Infrastructure/DomainService.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class DomainService
/**
* @return Domain[]
*/
public function findAllSitesPrimaryDomain()
public function findAllSitesPrimaryDomain(): array
{
/** @var Site[] $sites */
$sites = $this->siteRepository->findAll()->toArray();
Expand Down

0 comments on commit 2710bed

Please sign in to comment.