-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcover-analyzer.py
132 lines (109 loc) · 4.95 KB
/
cover-analyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Takes a list of Archive identifiers via stdin, outputs an HTML page
# containing a table listing cover images and calculated "usefulness".
# Also writes a text file containing the identifiers of items with
# less useful covers and which should probably use their title page as
# the thumbnail.
# TODO: Examine the actual cover images instead of the existing thumbnail,
# because the existing thumbnail could already be the title page.
import html
import sys
from urllib.parse import quote
from urllib.request import urlopen

import numpy
from wand.image import Image
# Tunable settings

# Dimensions of the coarse grid the image is shrunk to for color checking
grid_width = grid_height = 16

# Fraction of each dimension kept when cropping (the remainder is discarded)
side_length_fraction_to_keep = 0.6

# Scale applied to RGB values so they read on a familiar range, e.g. 0-255
human_scale_factor = 255

# Largest standard deviation still taken to indicate a cloth cover
threshold_deviation = 12.0

# Text file that receives the identifiers which should use their title page
file_of_ids_to_use_title_page = 'title_page_ids.txt'
def _channel_deviations(url):
    """Return the (R, G, B) standard deviations of the thumbnail at *url*.

    The image is cropped to its central region, shrunk to a
    grid_width x grid_height grid, and the standard deviation of each color
    channel (scaled by human_scale_factor) is computed over the grid cells.

    Returns None when the response URL is the Archive's ``notfound.png``
    placeholder, i.e. there is no thumbnail to measure.
    """
    with urlopen(url) as response:
        # Handle placeholder images
        if response.geturl() == 'https://archive.org/images/notfound.png':
            return None
        with Image(file=response) as image:
            image_width, image_height = image.size
            # Use the center section to avoid barcodes etc. placed close to edges
            image.crop(
                width=int(round(side_length_fraction_to_keep * image_width)),
                height=int(round(side_length_fraction_to_keep * image_height)),
                gravity='center',
            )
            # Switch to coarse grid, allowing ImageMagick to average pixels
            image.resize(grid_width, grid_height)
            # Copy the channel values out while the image is still open
            pixels = [
                (pixel.red, pixel.green, pixel.blue)
                for image_row in image
                for pixel in image_row
            ]
    return tuple(
        numpy.std([value * human_scale_factor for value in channel])
        for channel in zip(*pixels)
    )


# Read identifiers from stdin and build one row of four table cells per item
# (image, status, link, diagnostics) in item_outputs. Identifiers whose cover
# shows little color variation are also written to the title-page file.
sys.stderr.write('Starting')
item_outputs = []
# The context manager closes the file even if the loop is interrupted
with open(file_of_ids_to_use_title_page, 'w') as title_page_ids_file:
    for line in sys.stdin:
        # strip() also drops the '\r' of CRLF input; blank lines hold no identifier
        identifier = line.strip()
        if not identifier:
            continue
        sys.stderr.write('.')
        sys.stderr.flush()

        # Identifiers arrive from stdin, so percent-encode them for URLs (the
        # result contains no HTML-special characters) and HTML-escape the
        # visible link text to keep them from injecting markup into the page.
        quoted_identifier = quote(identifier, safe='')
        url = 'https://archive.org/services/img/{}'.format(quoted_identifier)
        image_cell = '<img src="{}" alt="">'.format(url)
        link_cell = '<a href="https://archive.org/details/{}">{}</a>'.format(
            quoted_identifier, html.escape(identifier)
        )
        diagnostics_cell = ''  # Stays empty unless deviations were measured

        try:
            deviations = _channel_deviations(url)
            if deviations is None:
                # Placeholder image: reported as not useful, but (as before)
                # not added to the title-page list
                status_cell = '<span class="not-useful">not useful</span>'
            elif all(deviation < threshold_deviation for deviation in deviations):
                # Call it not useful (i.e. cloth cover) if all channels show little variation
                status_cell = '<span class="not-useful">not useful</span>'
                title_page_ids_file.write(identifier + '\n')
            else:
                status_cell = '<span class="useful">USEFUL</span>'
            if deviations is not None:
                # Show standard deviation for each channel (for diagnostics)
                diagnostics_cell = '({:.2f}, {:.2f}, {:.2f})'.format(*deviations)
        except Exception as e:
            # Deliberately broad: one unreadable image must not abort the run.
            # The failure is reported on stderr and shown in the table.
            status_cell = '<span class="error">error reading</span>'
            diagnostics_cell = ''
            sys.stderr.write('\nERROR with image for {identifier}: {error}\n'.format(identifier=identifier, error=str(e)))

        # Cells are assembled in one place so every row has exactly four,
        # with the identifier third (priority goes to cover image and status)
        item_outputs.append([image_cell, status_cell, link_cell, diagnostics_cell])
# Emit the HTML report on stdout: page header, one table row per item, footer
page_header = """
<!DOCTYPE html>
<html>
<head>
<title>Cover Analyzer</title>
<style>
body { font-family: sans-serif; }
th { text-align: left; }
td { padding: 4px; }
.useful { font-weight: bold; }
.not-useful { color: gray }
.error { color: red }
</style>
</head>
<body>
<table>
<tr>
<th>Cover Image</th>
<th>Useful?</th>
<th>Identifier</th>
<th>Std Deviation (R, G, B)</th>
</tr>
"""
page_footer = """
</table>
</body>
</html>
"""

print(page_header)
for cells in item_outputs:
    # One line per tag, matching what a print() per tag would produce
    row_lines = ['<tr>']
    row_lines.extend('<td>{}</td>'.format(cell) for cell in cells)
    row_lines.append('</tr>')
    print('\n'.join(row_lines))
print(page_footer)

# Progress messages go to stderr so they stay out of the HTML on stdout
sys.stderr.write('done\n')