-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-scrape-store-answer.php
150 lines (126 loc) · 4.72 KB
/
01-scrape-store-answer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
<?php
declare(strict_types=1);
/*
* This file is part of PHP LLM Documents.
*
* (c) Thomas Joußen <[email protected]>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
use Google\Client;
use Thojou\LLMDocuments\Crawler\Spatie\SpatieCrawlerConfig;
use Thojou\LLMDocuments\Crawler\Spatie\SpatieCrawlerFactory;
use Thojou\LLMDocuments\Document\Document;
use Thojou\LLMDocuments\Document\DocumentInterface;
use Thojou\LLMDocuments\Embedding\OpenAi\OpenAiEmbeddings;
use Thojou\LLMDocuments\Loader\WebSearchLoader;
use Thojou\LLMDocuments\Parser\Unstructured\Api\UnstructuredAPI;
use Thojou\LLMDocuments\Parser\Unstructured\UnstructuredParserFactory;
use Thojou\LLMDocuments\Retriever\SimilarityRetriever;
use Thojou\LLMDocuments\Search\Google\GoogleSearchEngineFactory;
use Thojou\LLMDocuments\Splitter\RecursiveTextSplitter;
use Thojou\LLMDocuments\Storage\VectorStore\LocalVectorStore;
use Thojou\LLMDocuments\Transformation\DocumentTransformationBuilder;
use Thojou\LLMDocuments\ValueObjects\DoctranConfig;
use Thojou\OpenAi\OpenAi;
use Thojou\SimpleApiClient\Adapter\GuzzleClientFactory;
require_once __DIR__ . '/../vendor/autoload.php';
require_once __DIR__ . '/credentials.php';
// INPUT
// The query is mandatory: fail fast with a usage hint instead of continuing
// with an undefined array key (and a null query) when it is missing or blank.
$query = $argv[1] ?? '';
if (trim($query) === '') {
    fwrite(STDERR, "Usage: php {$argv[0]} <query> [num] [threshold]\n");
    exit(1);
}
// Optional second argument, handed to the WebSearchLoader (default: 2)
$num = (int)($argv[2] ?? 2);
// Optional third argument, handed to the SimilarityRetriever (default: 0.75)
$threshold = (float)($argv[3] ?? 0.75);
// Create the OpenAI API client shared by the embeddings, the summarizer and the chat call
$openAI = new OpenAi(OPENAI_KEY);
// Build the collaborators of the WebSearchLoader one by one, then wire them together

// Web search goes through the Google API
$searchEngineFactory = new GoogleSearchEngineFactory(
    new Client([
        'application_name' => 'BrAin/1.0',
        'developer_key' => GOOGLE_DEVELOPER_KEY,
    ]),
    SEARCH_ENGINE_ID,
    true
);

// Web crawling uses the SpatieCrawler, pointed at a remote instance resolved from the host name "chromium"
$crawlerConfig = (new SpatieCrawlerConfig())
    ->setRemoteInstance(gethostbyname('chromium'), 9222)
    ->setEnableJavascript(true)
    ->setNodeBinary('/root/.nvm/versions/node/v18.17.0/bin/node')
    ->setNpmBinary('/root/.nvm/versions/node/v18.17.0/bin/npm');
$crawlerFactory = new SpatieCrawlerFactory($crawlerConfig);

// The crawled websites are parsed into raw text by the UnstructuredParser
$parserFactory = new UnstructuredParserFactory(
    new UnstructuredAPI(
        new GuzzleClientFactory('http://unstructured:8000', 'DemoAI')
    )
);

$webSearchLoader = new WebSearchLoader(
    $searchEngineFactory,
    $crawlerFactory,
    $parserFactory,
    $num
);
// Set up the SimilarityRetriever used to store documents and find similar ones

// Vector store kept in a local JSON file, with embeddings computed through OpenAI
$vectorStore = new LocalVectorStore(
    '/tmp/test.json',
    new OpenAiEmbeddings($openAI, 'text-embedding-ada-002'),
);

// Text splitter configuration handed to the retriever
$textSplitter = new RecursiveTextSplitter(chunkSize: 512, chunkOverlap: 128);

$retriever = new SimilarityRetriever($vectorStore, $threshold, 4, $textSplitter);
// Build the summarizing transformation that is applied to the loaded documents
$doctranConfig = new DoctranConfig($openAI, 'gpt-3.5-turbo', 4000);
$summarizer = (new DocumentTransformationBuilder($doctranConfig))->summarize(2000);
/**
 * Concatenates the given documents into a single plain-text context document.
 *
 * For each input document the text contains a "DOCUMENT <key>" heading, the
 * page content, a "METADATA" heading and the metadata as pretty-printed JSON,
 * followed by a blank line.
 *
 * @param array<DocumentInterface> $contextDocuments Documents to merge; the array keys appear in the headings.
 *
 * @return DocumentInterface A new Document wrapping the combined text (empty text for an empty array).
 */
function combineDocuments(array $contextDocuments): DocumentInterface
{
    // Slashes and non-ASCII characters stay unescaped so that any URLs in the
    // metadata are written verbatim (json_encode would otherwise emit "\/").
    // JSON_PARTIAL_OUTPUT_ON_ERROR substitutes unencodable values instead of
    // making json_encode return false, which would silently drop the metadata.
    $jsonFlags = JSON_PRETTY_PRINT
        | JSON_UNESCAPED_SLASHES
        | JSON_UNESCAPED_UNICODE
        | JSON_PARTIAL_OUTPUT_ON_ERROR;

    $text = '';
    foreach ($contextDocuments as $key => $document) {
        $text .= "DOCUMENT $key\n";
        $text .= $document->getPageContent() . "\n";
        $text .= "METADATA\n";
        $text .= json_encode($document->getMetadata(), $jsonFlags) . "\n";
        $text .= "\n";
    }

    return new Document($text);
}
// Load documents from the web for the query
echo "Start searching the web for '$query'\n";
$documents = $webSearchLoader->load($query);

// Replace every loaded document with its summarized version, keeping its key
foreach ($documents as $index => $loadedDocument) {
    echo "Start summarizing the context\n";
    $documents[$index] = $summarizer->execute($loadedDocument);
}
echo "Found ", count($documents), " documents\n";

// Hand the summaries to the retriever, then ask it for documents relevant to the query
echo "Start adding documents to the retriever\n";
$retriever->addDocuments($documents);
echo "Start retrieving relevant documents\n";
$contextDocuments = $retriever->getRelevantDocuments($query);

// Merge the relevant documents into one context document
echo "Start combining documents\n";
$document = combineDocuments($contextDocuments);
// Ask the chat model to answer the query from the combined context only
echo "Start chat with OpenAI\n";
$context = $document->getPageContent();
// Request payload: the model name plus a system instruction and the user message
$messages = [
    'model' => 'gpt-3.5-turbo',
    'messages' => [
        ['role' => 'system', 'content' => "Respond to the user's query using only the information provided in the given context. If you lack the necessary information to answer the question, reply with 'I don't know'. Please include at least one source as link in your response."],
        ['role' => 'user', 'content' => "CONTEXT INFORMATION:\n===\n$context\n===\nUSER QUERY: $query\n===\nRESPONSE:"],
    ]
];
// Print the payload for inspection before sending it
echo json_encode($messages, JSON_PRETTY_PRINT) . "\n";
$response = $openAI->chat()->completion($messages);

// The response may not contain a completion (e.g. an error payload); report
// that explicitly instead of emitting "undefined array key" warnings and an
// empty answer with a zero exit status.
$answer = $response['choices'][0]['message']['content'] ?? null;
if (!is_string($answer)) {
    fwrite(
        STDERR,
        "OpenAI response did not contain a completion:\n"
        . json_encode($response, JSON_PRETTY_PRINT | JSON_PARTIAL_OUTPUT_ON_ERROR) . "\n"
    );
    exit(1);
}

echo "User QUERY: $query\n";
echo "RESPONSE:\n";
echo $answer . "\n";