Skip to content

Commit

Permalink
feat: add file readers
Browse files Browse the repository at this point in the history
  • Loading branch information
bernard-ng committed Jul 29, 2024
1 parent c227348 commit b8e9276
Show file tree
Hide file tree
Showing 22 changed files with 765 additions and 9 deletions.
4 changes: 3 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
"symfony/serializer": "^7.1",
"symfony/property-access": "^7.1",
"webmozart/assert": "^1.11",
"league/commonmark": "^2.4"
"league/commonmark": "^2.4",
"smalot/pdfparser": "^2.10",
"symfony/filesystem": "^7.1"
},
"require-dev": {
"phpunit/phpunit": "^11.1",
Expand Down
119 changes: 118 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@

declare(strict_types=1);

namespace Devscast\Lugha\Retrieval\Loader;
namespace Devscast\Lugha\Retrieval\Loader\Directory;

use Devscast\Lugha\Retrieval\Document;
use Devscast\Lugha\Retrieval\Loader\LoaderInterface;
use Devscast\Lugha\Retrieval\Metadata;
use Devscast\Lugha\Retrieval\Splitter\SplitterInterface;

/**
Expand All @@ -25,24 +27,38 @@
readonly class DirectoryLoader implements LoaderInterface
{
public function __construct(
public string $directory,
public ?string $glob = null
public string $path
) {
}

/**
* @return iterable<Document>
*/
#[\Override]
public function load(): iterable
{
return [];
/** @var RecursiveDirectoryIterator|\DirectoryIterator $file */
foreach (new WildcardDirectoryIterator($this->path) as $file) {
if ($file->isFile()) {
yield new Document(
content: (string) file_get_contents($file->getPathname()),
metadata: new Metadata(
sourceType: 'file',
sourceName: $file->getFilename(),
),
);
}
}
}

/**
* @return iterable<Document>
*/
#[\Override]
public function loadAndSplit(SplitterInterface $splitter): iterable
{
return [];
foreach ($this->load() as $document) {
yield from $splitter->createDocuments($document);
}
}
}
33 changes: 33 additions & 0 deletions src/Retrieval/Loader/Directory/RecursiveDirectoryIterator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?php

/*
* This file is part of the Lugha package.
*
* (c) Bernard Ngandu <[email protected]>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

declare(strict_types=1);

namespace Devscast\Lugha\Retrieval\Loader\Directory;

use RecursiveDirectoryIterator as NativeRecursiveDirectoryIterator;
use RecursiveIteratorIterator;

/**
 * Class RecursiveDirectoryIterator.
 *
 * Thin convenience wrapper: builds PHP's native RecursiveDirectoryIterator for
 * the given path and hands it to RecursiveIteratorIterator (default mode and
 * flags), so that callers can walk a directory tree with a single foreach.
 *
 * @extends RecursiveIteratorIterator<NativeRecursiveDirectoryIterator>
 * @see https://www.php.net/manual/en/class.recursivedirectoryiterator.php
 *
 * @author bernard-ng <[email protected]>
 */
final class RecursiveDirectoryIterator extends RecursiveIteratorIterator
{
    /**
     * @param string $path Root directory of the traversal.
     */
    public function __construct(string $path)
    {
        $directory = new NativeRecursiveDirectoryIterator($path);

        parent::__construct($directory);
    }
}
62 changes: 62 additions & 0 deletions src/Retrieval/Loader/Directory/WildcardDirectoryIterator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php

/*
* This file is part of the Lugha package.
*
* (c) Bernard Ngandu <[email protected]>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

declare(strict_types=1);

namespace Devscast\Lugha\Retrieval\Loader\Directory;

use DirectoryIterator;
use FilterIterator;

/**
 * Class WildcardDirectoryIterator.
 *
 * Iterates over a directory and keeps only the entries whose filename matches
 * a shell-like wildcard found in the last path segment (e.g. "docs/*.md").
 * Prefixing the path with "-R " switches to a recursive traversal. When the
 * path carries no wildcard, every entry is accepted.
 *
 * @extends FilterIterator<int, string, RecursiveDirectoryIterator|DirectoryIterator>
 *
 * @author bernard-ng <[email protected]>
 */
final class WildcardDirectoryIterator extends FilterIterator
{
    /**
     * Anchored regex built from the wildcard, or null when no wildcard was
     * given (in which case every entry is accepted).
     */
    private ?string $regex = null;

    /**
     * @param string $path Directory path, optionally prefixed with "-R " for
     *                     recursion and optionally ending with a wildcard
     *                     filename segment such as "*.txt".
     */
    public function __construct(string $path)
    {
        $recursive = false;

        if (str_starts_with($path, '-R ')) {
            $recursive = true;
            $path = substr($path, 3);
        }

        // Only the last path segment may carry wildcards.
        if (preg_match('~/?([^/]*\*[^/]*)$~', $path, $matches) === 1) {
            $pattern = $matches[1];

            // Quote every regex metacharacter first, then turn the (now escaped)
            // "*" back into ".*" so that only the wildcard has a special meaning.
            $this->regex = '~^' . str_replace('\*', '.*', preg_quote($pattern, '~')) . '$~';

            // Everything before the wildcard segment is the directory to scan.
            $directory = substr($path, 0, -strlen($pattern));
            $trimmed = rtrim($directory, '/');

            $path = match (true) {
                $directory === '' => '.',   // bare "*.txt": assume the CWD
                $trimmed === '' => '/',     // "/*.txt": the filesystem root
                default => $trimmed,
            };
        }

        parent::__construct($recursive ? new RecursiveDirectoryIterator($path) : new DirectoryIterator($path));
    }

    /**
     * Checks the current filename against the regex, or accepts every entry
     * when no wildcard was specified.
     */
    #[\Override]
    public function accept(): bool
    {
        if ($this->regex === null) {
            return true;
        }

        /** @var RecursiveDirectoryIterator|DirectoryIterator $iterator */
        $iterator = $this->getInnerIterator();

        return preg_match($this->regex, $iterator->getFilename()) === 1;
    }
}
64 changes: 64 additions & 0 deletions src/Retrieval/Loader/Reader/AbstractReader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php

/*
* This file is part of the Lugha package.
*
* (c) Bernard Ngandu <[email protected]>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

declare(strict_types=1);

namespace Devscast\Lugha\Retrieval\Loader\Reader;

use Devscast\Lugha\Retrieval\Loader\Reader\Exception\FileNotFoundException;
use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnreadableFileException;
use Devscast\Lugha\Retrieval\Loader\Reader\Exception\UnsupportedFileException;
use Symfony\Component\Filesystem\Path;

/**
 * Class AbstractReader.
 *
 * Base class for file readers. It provides the shared extension and
 * file-existence guards; concrete readers implement readContent().
 *
 * @author bernard-ng <[email protected]>
 */
abstract readonly class AbstractReader
{
    /**
     * Supported extensions regex pattern.
     *
     * The default is empty, so supports() returns false until a concrete
     * reader overrides this constant with a real pattern.
     */
    public const string SUPPORTED_EXTENSIONS_PATTERN = '';

    /**
     * @throws UnsupportedFileException If the file extension is not supported and the check is not skipped.
     * @throws UnreadableFileException When the content cannot be read for any other reason
     * @throws FileNotFoundException When the given file does not exist
     */
    abstract public function readContent(string $path, bool $skipExtensionCheck = false): string;

    /**
     * Tells whether the lower-cased extension of the given path matches
     * the reader's supported extensions pattern.
     */
    final public function supports(string $path): bool
    {
        $pattern = static::SUPPORTED_EXTENSIONS_PATTERN;

        // An empty pattern would make preg_match() emit an "Empty regular
        // expression" warning; treat it as "nothing is supported" instead.
        if ($pattern === '') {
            return false;
        }

        $extension = Path::getExtension($path, forceLowerCase: true);

        return preg_match($pattern, $extension) === 1;
    }

    /**
     * @throws UnsupportedFileException If the file extension is not supported.
     */
    final public function ensureSupported(string $path): void
    {
        if ($this->supports($path)) {
            return;
        }

        // The extension is only needed to build the exception.
        $extension = Path::getExtension($path, forceLowerCase: true);

        throw new UnsupportedFileException([$extension, static::SUPPORTED_EXTENSIONS_PATTERN]);
    }

    /**
     * @throws FileNotFoundException When the given path does not exist
     * @throws UnreadableFileException When the given path exists but is not readable
     */
    final public function ensureFileExists(string $path): void
    {
        if (file_exists($path) === false) {
            throw new FileNotFoundException($path);
        }

        if (is_readable($path) === false) {
            throw new UnreadableFileException($path);
        }
    }
}
Loading

0 comments on commit b8e9276

Please sign in to comment.