-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate-descriptions.py
126 lines (93 loc) · 4.3 KB
/
generate-descriptions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
from __future__ import print_function
# Import libraries
import time
import requests
import operator
import numpy as np
import pandas as pd
import xmltodict
# Settings for the Computer Vision service
_region = 'francecentral'  # Region of your subscription
_url = 'https://{}.api.cognitive.microsoft.com/vision/v2.0/analyze'.format(_region)
_key = None  # Paste your primary key here
_maxNumRetries = 10  # Retry limit applied when the API answers 429
# Names assigned later by the script: the image URL and the request body.
# At module level a global declaration has no effect; it only documents intent.
global urlImage, json

# Query parameters and headers sent with every Computer Vision request
params = {'visualFeatures': 'Description'}
headers = {
    'Ocp-Apim-Subscription-Key': _key,
    'Content-Type': 'application/json',
}
# Open the Media XML file that was generated with WordPress Export.
# The file is read as raw bytes so the result does not depend on the
# platform's default text encoding (xmltodict parses the bytes as UTF-8
# unless told otherwise).
with open('designsampsigns.wordpress.2019-02-05_test.xml', 'rb') as fd: #Using _test.xml file for testing
    raw = xmltodict.parse(fd.read())

# xmltodict returns a single mapping instead of a list when the channel
# holds exactly one <item>; normalise so the loop below always sees a list.
items = raw["rss"]["channel"]["item"]
if isinstance(items, dict):
    items = [items]

# Create one row per item: title, url, description, plus an empty confidence
# slot (None, so nothing but a real score can end up in that column).
data_x = [[r["title"], r["guid"]["#text"], r["description"], None] for r in items]
print("Number of images", len(data_x))
# Does the actual results request to the Computer Vision API
def _responseMessage( response ):
    """
    Return the body of a response for logging purposes.

    The decoded JSON is returned when the body is valid JSON; otherwise the
    raw text is returned, so reporting an error can never raise by itself.
    """
    try:
        return response.json()
    except ValueError:
        return response.text


def processRequest( json, data, headers, params ):
    """
    Post one request to the Computer Vision endpoint (_url) and return its result.

    A 429 response is retried after a one-second pause, at most
    _maxNumRetries times. Any other non-success status is printed and the
    request is abandoned.

    Parameters:
    json: Used when processing images from its URL. See API Documentation
    data: Used when processing image read from disk. See API Documentation
    headers: Used to pass the key information and the data type request
    params: Query-string parameters sent with the request

    Returns:
    The decoded JSON body for an 'application/json' response, the raw bytes
    for an image response, or None when the body is empty, the content type
    is neither of those, or the request did not succeed.
    """
    retries = 0
    result = None

    while True:
        response = requests.request( 'post', _url, json = json, data = data, headers = headers, params = params )

        if response.status_code == 429:
            print( "Message: %s" % ( _responseMessage( response ), ) )
            # Strictly below the limit, so exactly _maxNumRetries retries are made
            if retries < _maxNumRetries:
                time.sleep(1)
                retries += 1
                continue
            print( 'Error: failed after retrying!' )
            break

        elif response.status_code == 200 or response.status_code == 201:
            if 'content-length' in response.headers and int(response.headers['content-length']) == 0:
                result = None
            elif 'content-type' in response.headers and isinstance(response.headers['content-type'], str):
                contentType = response.headers['content-type'].lower()
                if 'application/json' in contentType:
                    result = response.json() if response.content else None
                elif 'image' in contentType:
                    result = response.content
        else:
            print( "Error code: %d" % ( response.status_code ) )
            # Error bodies are not guaranteed to be JSON, so decode defensively
            print( "Message: %s" % ( _responseMessage( response ), ) )

        # Success or a non-retryable error: either way the loop is done
        break

    return result
# Loop in the list and send the request to the Computer Vision
for k in data_x:
    if k[2] is not None:
        # The item already has a description: keep it and skip the API call
        print("Description available:", k[2])
        k[3] = None #Leave confidence blank
        continue

    # Call the API only if the description is missing
    print(k[1]) #Print URL of the image
    urlImage = k[1]
    json = { 'url': urlImage } #Here is the json for sending out the request
    data = None
    result = processRequest( json, data, headers, params )
    print(result)

    # Start from a blank confidence so a failed lookup never leaves a
    # placeholder value in the column that is later written to the CSV
    k[3] = None

    # Extract the caption if available and the confidence value
    if isinstance(result, dict) and 'description' in result:
        try:
            caption = result['description']['captions'][0]
            # Both values are looked up before either is stored, so a row is
            # never updated with only one of them
            k[2], k[3] = caption['text'], caption['confidence']
        except (KeyError, IndexError) as err:
            # No caption in the answer (empty list or missing key)
            print("No caption returned:", repr(err))
        print(k[2]) #Print Description
# Write every row (title, url, description, confidence) to out.csv
csv_columns = ["title", "url", "description", "confidence"]
df = pd.DataFrame(data_x, columns=csv_columns)
df.to_csv("out.csv", index=False, encoding='utf-8')
print("Results saved on CSV")