-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathim.py
111 lines (87 loc) · 3.74 KB
/
im.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Standard library
import json
import os
import urllib
import urllib.parse  # explicit: a bare `import urllib` does not load the parse submodule
from random import randint
from time import sleep

# Third-party
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook
# HTTP headers sent with every search request.
# NOTE(review): the Access-Control-* entries are CORS *response* headers;
# sending them in a request is presumably inert and the server likely
# ignores them — only the User-Agent plausibly matters here. Confirm
# before removing.
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
# Stage 1: find .xlsx workbooks under the working tree, drop rows with
# missing values, and stage the result as 'prefinal.xlsx'.
# NOTE: when several workbooks match, each one overwrites the previous
# output, so the last file visited wins — this mirrors the original intent
# of processing the single source workbook in the folder.
for root, dirs, files in os.walk(os.getcwd()):
    for file in files:
        if file.endswith('.xlsx'):
            print(file)
            # Join with the directory it was found in: the bare filename
            # only resolves when the workbook sits in the cwd itself.
            df = pd.read_excel(os.path.join(root, file))
            df.dropna(inplace=True)
            df.to_excel('prefinal.xlsx')
# Stage 2: trim the staged workbook — drop helper columns and the two
# leading rows — and persist the cleaned sheet as 'final.xlsx'.
book = load_workbook('prefinal.xlsx')
page = book.active
page.delete_cols(5, 1)  # drop column E first, while its index is still valid
page.delete_cols(1, 2)  # then columns A-B
page.delete_rows(1, 2)  # and the two header rows
book.save('final.xlsx')
# Stage 3: dump the cleaned workbook to a plain-text listing ('test.txt'),
# one record per line, without the index column.
listing = pd.read_excel('final.xlsx').to_string(index=False)
with open('test.txt', 'w') as out:
    out.write(listing)
# Stage 4: prepare the output workbook that will collect one row per
# search hit (title, author, link, price, shop name — Czech headers).
excel = openpyxl.Workbook()
sheet = excel.active
sheet.title = 'seznam antiku'
header_row = ['název', 'autor', 'odkaz', 'cena', 'antik']
sheet.append(header_row)
# Stage 5: read the text listing back and normalise each line into a
# search query (strip noise markers, pseudonym tags, tabs, collective
# authorship), building the list the scraper iterates over.
huge_list = []
with open('test.txt', "r") as source:
    for raw_line in source:
        cleaned = raw_line
        for old, new in (('++', ''), ('(pseudonym)', ''), ('. ', ''),
                         ('\t', ' '), ('kolektiv autorů', '')):
            cleaned = cleaned.replace(old, new)
        huge_list.append(cleaned.strip())
def get_organic_results():
    """Search ulovknihu.cz for every query in ``huge_list`` and append the
    first hit to the module-level ``sheet``.

    For each query the first matching product's title, author, link, price
    and antiquarian-shop name are collected; missing fields fall back to
    the query itself (title) or 'Nic nenalezeno'. Relies on the module
    globals ``huge_list``, ``headers`` and ``sheet``; network errors and
    timeouts are not caught and will propagate.
    """
    # Search URL with region filters 1-14. Built programmatically because
    # the hand-written version had every '&reg' sequence mangled into '®'
    # by an HTML-entity round trip, silently breaking the region params.
    base_url = (
        'https://www.ulovknihu.cz/hledat?q={}&state%5B%5D=cz'
        + ''.join('&region%5B%5D={}'.format(i) for i in range(1, 15))
        + '&stone=0&also_sold=0&sort=2&price_min=&price_max='
    )
    for query in huge_list:
        encoded = urllib.parse.quote_plus(query)
        # sleep(randint(4, 64))  # optional throttle to stay polite to the site
        html = requests.get(base_url.format(encoded), headers=headers,
                            timeout=5).text
        soup = BeautifulSoup(html, 'html.parser')
        data = []
        # Each lookup falls back to a placeholder when the element is
        # missing: find() returns None, so chained access raises
        # AttributeError (or TypeError for the ['href'] subscript).
        try:
            title = soup.find('div', {'class': 'my-md-td searchList__product__info'}).find('h2').text
        except AttributeError:
            # Fall back to the human-readable query, not the URL-encoded form.
            title = query
        try:
            author = soup.find('div', {'class': 'searchList__product__info__autor'}).find('a').text
        except AttributeError:
            author = 'Nic nenalezeno'
        try:
            link = soup.find('a', {'class': 'btn searchList__product__vendor__bottom__link'})['href']
        except TypeError:
            link = 'Nic nenalezeno'
        try:
            price = float(soup.find('div', {'class': 'searchList__product__vendor__bottom__price'}).text.replace('Kč', '').strip())
        except (AttributeError, ValueError):
            price = 'Nic nenalezeno'
        try:
            ant_name = soup.find('div', {'class': 'my-md-td searchList__product__vendor'}).find('span').text.replace('\n', '').replace(' ', '').strip()
        except AttributeError:
            ant_name = 'Nic nenalezeno'
        data.append([title, author, link, price, ant_name])
        sheet.append([title, author, link, price, ant_name])
        # To collect every hit per query instead of only the first one,
        # iterate soup.select('.my-md-row') and repeat the lookups per item.
        print(data)
# Run the scrape, then persist the collected rows to the final workbook.
get_organic_results()
excel.save('seznam_knih.xlsx')