-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathspamfilter.php
298 lines (254 loc) · 9.9 KB
/
spamfilter.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
<?php
// spamfilter.php -- Filter through text, searching for spam
// Copyright (C) 2013 Andreas Renberg <[email protected]>
//
// This program is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 3, as
// published by the Free Software Foundation.
//
// This program is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
//
// You should have received a copy of the GNU General Public License along
// with this program; if not, see <http://www.gnu.org/licenses/>
/**
 * Scans text for spam by matching it against regular-expression blacklists.
 *
 * A blacklist is a plain text file containing one regular expression per line
 * (without delimiters). Anything from a `#` to the end of the line is treated
 * as a comment, and blank lines are ignored. Every expression is matched
 * case-insensitively.
 *
 * A blacklist directory contains an `index` file (one blacklist filename per
 * line), the blacklist files themselves, and a `version` file.
 */
class SpamFilter
{
    // Directory holding the blacklist files, or `null` when the filter was
    // constructed from an explicit array of blacklist files.
    private $blacklist_directory;

    // Base url (or local path) that blacklist filenames are appended to when updating.
    private $blacklist_update_url;

    // Array of blacklist filenames; absolute, or relative to the blacklist directory.
    private $blacklists;

    // Set to `true` as soon as an update has deleted or overwritten a file in the
    // blacklist directory; reset to `false` once an update completes successfully.
    private $blacklist_directory_dirty = false;

    /* $blacklists can be one of the following options
     *    null: uses the default blacklist folder
     *    a string: a path to a custom blacklist folder
     *    an array of strings: Each string should point to a blacklist file
     *
     * Any other value triggers an error and leaves the filter without blacklists.
     *
     * $blacklist_update_url can either be a url, or a local path. (the latter is yet untested)
     * Filenames are appended to it directly, so it has to end with a separator.
     * When it is `null`, the default update url is used.
     */
    public function __construct($blacklists = null, $blacklist_update_url = null)
    {
        if ($blacklists === null)
        {
            $blacklists = self::default_blacklist_directory();
        }

        if (is_array($blacklists))
        {
            $this->blacklist_directory = null;
            $this->blacklists = $blacklists;
        }
        elseif (is_string($blacklists))
        {
            $this->blacklist_directory = $blacklists;
            $this->blacklists = $this->get_blacklists_from_directory($blacklists);
        }
        else
        {
            trigger_error("[SpamFilter::__construct()] Error: Invalid value for parameter \$blacklist.");
            $this->blacklist_directory = null;
            $this->blacklists = array();
        }

        $this->blacklist_update_url = ($blacklist_update_url === null)
            ? self::default_blacklist_update_url()
            : $blacklist_update_url;
    }

    // Absolute path of the `blacklists` folder located next to this file.
    public static function default_blacklist_directory()
    {
        return dirname(__FILE__) . DIRECTORY_SEPARATOR . 'blacklists'; // absolute path
    }

    // Base url the blacklists are downloaded from when no custom url is supplied.
    public static function default_blacklist_update_url()
    {
        return "https://raw.github.com/IQAndreas/php-spam-filter/blacklists/";
    }

    // Reads the `index` file of a blacklist directory and returns the filenames
    // listed in it. Triggers an error and returns an empty array if there is no index.
    private function get_blacklists_from_directory($blacklist_directory)
    {
        $blacklist_index = $blacklist_directory . DIRECTORY_SEPARATOR . 'index';
        if (!file_exists($blacklist_index))
        {
            trigger_error("[SpamFilter::__construct()] Error: Cannot find blacklist index in `$blacklist_directory`.");
            return array();
        }
        return $this->get_list_from_file($blacklist_index);
    }

    // Returns the non-empty lines of a file as an array, accepting any of the
    // common line endings. Returns an empty array if the file cannot be read.
    private function get_list_from_file($file_path)
    {
        $file_contents = file_get_contents($file_path);
        if ($file_contents === false) return array();

        // A limit of -1 means "no limit". Passing NULL here is deprecated as of PHP 8.1.
        $lines = preg_split("/\r\n|\r|\n/", $file_contents, -1, PREG_SPLIT_NO_EMPTY);
        return ($lines === false) ? array() : $lines;
    }

    // Checks the text against every blacklist, in order.
    // Returns the first piece of text that matched a blacklisted expression,
    // or `false` if no spam was found.
    public function check_text($text)
    {
        foreach ($this->blacklists as $blacklist_filename)
        {
            $match = $this->regex_match_from_blacklist($text, $blacklist_filename);
            // Compare explicitly so a legitimate match such as "0" is not discarded.
            // An empty match carries no information, so keep looking in that case.
            if ($match !== false && $match !== '') return $match;
        }

        // No spam found
        return false;
    }

    // Same return values as `check_text()`.
    public function check_url($url)
    {
        // TODO! Just treat the url as plain text for now.
        return $this->check_text($url);
    }

    // Matches the text against every expression in a single blacklist file.
    // Returns the matched text, or `false` if nothing matched or the blacklist
    // could not be found or read.
    private function regex_match_from_blacklist($text, $blacklist)
    {
        if (!file_exists($blacklist))
        {
            $path = $this->blacklist_directory;
            if ($path === null) $path = self::default_blacklist_directory();

            // Check to see if they supplied a relative path instead of an absolute one.
            $blacklist_absolute = $path . DIRECTORY_SEPARATOR . $blacklist;
            if (!file_exists($blacklist_absolute))
            {
                trigger_error("[SpamFilter::regex_match_from_blacklist()] Error: Cannot find blacklist with name `$blacklist_absolute`.");
                return false;
            }
            $blacklist = $blacklist_absolute;
        }

        $keywords = file($blacklist);
        if ($keywords === false)
        {
            trigger_error("[SpamFilter::regex_match_from_blacklist()] Error: Unable to read blacklist `$blacklist`.");
            return false;
        }

        $current_line = 0;
        $regex_match = array();
        foreach ($keywords as $regex)
        {
            $current_line++;

            // Remove comments and whitespace before and after a keyword
            $regex = preg_replace('/(^\s+|\s+$|\s*#.*$)/i', "", $regex);

            // Compare against the empty string rather than using empty(),
            // which would also skip the perfectly valid keyword "0".
            if ($regex === null || $regex === '') continue;

            // Warnings are suppressed on purpose: an invalid expression is
            // reported below, including the file and line it came from.
            $match = @preg_match("/$regex/i", $text, $regex_match);
            if ($match === 1)
            {
                // Spam found. Return the text that was matched
                return $regex_match[0];
            }
            elseif ($match === false)
            {
                trigger_error("[SpamFilter::regex_match_from_blacklist()] Error: Invalid regular expression in `$blacklist` line $current_line.");
            }
        }

        // No spam found
        return false;
    }

    // Path of the `version` file inside the current blacklist directory.
    private function version_file_path()
    {
        return $this->blacklist_directory . DIRECTORY_SEPARATOR . 'version';
    }

    // Returns the contents of the local `version` file,
    // or `null` if not currently using a valid blacklist directory
    public function version()
    {
        // Without a directory the path below would become `/version`,
        // which has nothing to do with the blacklists.
        if ($this->blacklist_directory === null) return null;

        $blacklist_version_file = $this->version_file_path();
        if (!file_exists($blacklist_version_file)) return null;

        $contents = file_get_contents($blacklist_version_file);
        return ($contents === false) ? null : trim($contents);
    }

    // Returns `true` if an update exists, `false` if using the same version as on the server,
    // and `null` if not currently using a valid blacklist directory or if the
    // version on the server could not be retrieved.
    public function blacklist_update_available()
    {
        // Will only check if the version numbers do not match, not if one is newer than the other.
        $current_version = $this->version();
        if ($current_version === null) return null;

        $remote_version = file_get_contents($this->blacklist_update_url . 'version');
        if ($remote_version === false)
        {
            // Do not mistake a failed download for an empty (and therefore different) version.
            trigger_error("[SpamFilter::blacklist_update_available()] Error: Unable to retrieve the remote blacklist version.");
            return null;
        }

        return ($current_version !== trim($remote_version));
    }

    // WARNING: This will overwrite any of the old files!
    // Blacklists listed in the old index are deleted before the new ones are downloaded.
    // Pass `true` for $force to update even if the version numbers match.
    // Returns the current blacklist version as a string, or `null` if unable to update.
    public function update_blacklists($force = false)
    {
        if ($this->blacklist_directory === null) return null;

        if ($force || ($this->blacklist_update_available() === true))
        {
            if (!$this->try_blacklist_update())
            {
                if ($this->blacklist_directory_dirty)
                {
                    trigger_error("[SpamFilter::update_blacklists()] Error: Failure during the middle of an update, which means some of the blacklists may be incomplete. Please fix the problems and re-update the blacklists.");

                    // Overwrite the version so the next update check sees a mismatch and retries.
                    $version_error_message = "[Previous update failed. Please re-update the blacklists.]";
                    file_put_contents($this->version_file_path(), $version_error_message);
                }
                else
                {
                    trigger_error("[SpamFilter::update_blacklists()] Error: Update failed. Reverting to previous blacklist version.");
                }
                return null;
            }
        }

        // Returns the NEW version
        return $this->version();
    }

    // Performs the actual update: downloads the new index, deletes the old
    // blacklists, then downloads the new blacklists and the version file.
    // Returns `true` on success and `false` as soon as a download fails.
    private function try_blacklist_update()
    {
        // Store a reference to the old index of blacklists, as the new index is about to change
        $old_blacklists = $this->get_blacklists_from_directory($this->blacklist_directory);

        // Download the new index (is also a way to test if the script is able to connect to the download server)
        if (!$this->download_blacklist_file('index')) return false;

        // Delete old blacklist files
        foreach ($old_blacklists as $blacklist_filename)
        {
            if (!$this->delete_blacklist_file($blacklist_filename))
            {
                // Just ignore old blacklist files if you are unable to delete them.
                // They may get replaced by new files anyway.
                trigger_error("[SpamFilter::update_blacklists()] Warning: Unable to remove old blacklist file `$blacklist_filename`. Ignoring and continuing with the update.");
            }
        }

        // Loop through index, downloading new blacklist files
        $new_blacklists = $this->get_blacklists_from_directory($this->blacklist_directory);
        foreach ($new_blacklists as $blacklist_filename)
        {
            if (!$this->download_blacklist_file($blacklist_filename)) return false;
        }

        // Finally, download the new version file (will automatically update the current version number)
        if (!$this->download_blacklist_file('version')) return false;

        // Start using the new index right away; the old list may refer to
        // files that were just deleted.
        $this->blacklists = $new_blacklists;
        $this->blacklist_directory_dirty = false;
        return true;
    }

    // Deletes a single file from the blacklist directory.
    // Returns `false` if the file does not exist or could not be removed.
    private function delete_blacklist_file($blacklist_filename)
    {
        // Prevent injection which would try to reach files in a different directory
        // XXX: If this option is used, you may NOT use absolute paths for the blacklist files!!
        // However, that's not much of an issue, as the remote blacklist index will always use relative paths.
        $blacklist_filename = basename($blacklist_filename);
        $blacklist_file = $this->blacklist_directory . DIRECTORY_SEPARATOR . $blacklist_filename;

        if (!file_exists($blacklist_file)) return false;
        if (!unlink($blacklist_file)) return false;

        $this->blacklist_directory_dirty = true;
        return true;
    }

    // Downloads a single file from the update url into the blacklist directory,
    // overwriting any existing file with the same name.
    // Returns `false` if the file could not be downloaded or written.
    private function download_blacklist_file($blacklist_filename)
    {
        // Prevent injection which would try to place downloaded files in a different directory
        // (only the base name is used, so absolute paths are not supported here)
        $blacklist_filename = basename($blacklist_filename);
        $local_filename = $this->blacklist_directory . DIRECTORY_SEPARATOR . $blacklist_filename;
        $remote_filename = $this->blacklist_update_url . $blacklist_filename;

        $contents = file_get_contents($remote_filename);
        if ($contents === false) return false;

        if (file_put_contents($local_filename, $contents) === false) return false;

        $this->blacklist_directory_dirty = true;
        return true;
    }
}