-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdigikala-exif-scraper.py
69 lines (67 loc) · 3.21 KB
/
digikala-exif-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import requests
from time import sleep
from datetime import datetime
from PIL import Image, ExifTags, TiffImagePlugin
import os
import hashlib
import json
from tqdm import tqdm
# Endpoint listing a product's comments; the placeholder is the zero-padded id.
API_URL = "https://api.digikala.com/v1/product/{}/comments/"
OUTPUT_JSON = "output.json"   # accumulated per-comment records
OUTPUT_URLS = "output.txt"    # append-only log of every media URL seen
DOWNLOAD_DIR = "files"        # scratch directory for downloaded media
REQUEST_TIMEOUT = 30          # seconds; keeps a stalled connection from hanging the run


def strip_query(url):
    """Return *url* with any ``?query`` suffix removed.

    A URL without a ``?`` is returned unchanged (slicing at ``find('?')``
    would silently drop its last character in that case).
    """
    return str(url).partition("?")[0]


def local_name(url):
    """Return the last path segment of *url*, used as the scratch file name.

    Taking the last segment does not depend on the length of the URL prefix,
    unlike slicing off a fixed number of leading characters.
    """
    return url.rsplit("/", 1)[-1]


def exif_value_to_jsonable(val):
    """Convert one EXIF value into something ``json.dump`` can serialize.

    Rationals become floats, bytes are decoded (undecodable bytes are
    replaced), and tuples are converted element by element. Any other value
    is returned as-is.
    """
    if isinstance(val, TiffImagePlugin.IFDRational):
        return float(val)
    if isinstance(val, tuple):
        return tuple(exif_value_to_jsonable(item) for item in val)
    if isinstance(val, bytes):
        return val.decode(errors="replace")
    return val


def fetch_image_metadata(url, record):
    """Download *url* and store its pixel hash and EXIF tags on *record*.

    Sets ``record["hash"]`` to the MD5 hex digest of the decoded pixel data
    and ``record["exif"]`` to a ``{tag name: value}`` dict restricted to tags
    known to ``ExifTags.TAGS``. The downloaded file is always removed again,
    even when it cannot be decoded as an image.

    Raises whatever the HTTP request, the file write, or the image decoding
    raises; the caller decides how to report it.
    """
    reply = requests.get(url, timeout=REQUEST_TIMEOUT)
    reply.raise_for_status()  # do not try to decode an error page as an image
    path = os.path.join(DOWNLOAD_DIR, local_name(url))
    try:
        with open(path, "wb") as handle:
            handle.write(reply.content)
        # Context manager closes the image so the file can be deleted below.
        with Image.open(path) as img:
            record["hash"] = hashlib.md5(img.tobytes()).hexdigest()
            record["exif"] = {
                ExifTags.TAGS[key]: exif_value_to_jsonable(val)
                for key, val in img.getexif().items()
                if key in ExifTags.TAGS
            }
    finally:
        if os.path.exists(path):
            os.remove(path)


def scrape_comment(comment, product_id, data):
    """Record one media comment in *data* and inspect each attached file.

    The record is keyed by the comment's ``id``. When a comment has several
    files, ``url``/``hash``/``exif`` hold the values of the last file
    processed (each file overwrites the previous one's fields).

    Raises KeyError/TypeError/IndexError when the comment or one of its file
    entries lacks an expected field.
    """
    record = {
        "product_id": product_id,
        "created_at": comment["created_at"],
        "user_name": comment["user_name"],
    }
    data[comment["id"]] = record
    for entry in comment["files"]:
        clear_url = strip_query(entry["url"][0])
        record["url"] = clear_url
        with open(OUTPUT_URLS, "a") as output:
            output.write(clear_url + "\n")
        try:
            fetch_image_metadata(clear_url, record)
        except Exception as exc:
            # Best effort per file: a failed download or a non-image file
            # must not stop the scrape. KeyboardInterrupt is not an
            # Exception subclass, so Ctrl+C still propagates.
            tqdm.write(f"  could not inspect {clear_url}: {exc!r}")


def scrape_product(product_id, data):
    """Fetch the media comments of *product_id* and add them to *data*.

    A product whose response is not JSON, or lacks ``data.media_comments``,
    is reported and skipped. Network errors on the listing request propagate.
    """
    response = requests.get(
        API_URL.format(str(product_id).zfill(7)), timeout=REQUEST_TIMEOUT
    )
    status_code = response.status_code
    print(f" {product_id}[{str(status_code)}] - started at {datetime.now().strftime('%H:%M:%S')}")
    try:
        comments = response.json()["data"]["media_comments"] or []
    except (ValueError, KeyError, TypeError) as exc:
        print(f"  no media comments for product {product_id}: {exc!r}")
        return
    # One tick per comment, matching the bar's total.
    with tqdm(total=len(comments)) as progress:
        for comment in comments:
            try:
                scrape_comment(comment, product_id, data)
            except (KeyError, TypeError, IndexError) as exc:
                tqdm.write(f"  skipping malformed comment: {exc!r}")
            progress.update(1)


def save_results(data):
    """Write every record collected so far to ``output.json``."""
    with open(OUTPUT_JSON, "w") as output_json:
        json.dump(data, output_json, indent=6)


def main():
    """Scrape products 1, 2, 3, ... until interrupted with Ctrl+C.

    Results are saved after every product and once more on the way out,
    whether the loop ends by Ctrl+C or by an unexpected error.
    """
    data = {}
    product_id = 1
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    try:
        while True:
            scrape_product(product_id, data)
            save_results(data)
            product_id += 1
    except KeyboardInterrupt:
        # Ctrl+C is the intended way to stop; fall through to the final save.
        pass
    finally:
        save_results(data)


if __name__ == "__main__":
    main()