-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathkindle.py
194 lines (165 loc) · 6.42 KB
/
kindle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Ideas / TODO:
# select a book, delete or display it; present all quotes in the correct order, then delete or keep them.
# store the result in a DB and export it as Markdown
# visualization with a notebook
# and pandas
# how many notes
# how many distinct authors
# how many distinct titles
# clippings per title
# notes per author
# notes per year
# histogram over the whole time span
# number of notes, number of words, histogram by year, month, week, day
# calendar histogram?
# words per note
# maximum per book, author, day, counted in notes and in words
# find topics / clustering
import json, sys, os, codecs, re, time
from pathlib import Path
import dateparser
from collections import defaultdict
from typing import Tuple, List, Dict
# Path of the Kindle clippings export read by process_file(); being relative,
# it is resolved against the current working directory.
filename = 'My Clippings.txt'
# date = datetime.datetime.strptime(date, "%d %m %Y - %H:%M:%S")
regex_title = re.compile(r'(.*)\((.*)\)')
def split_title_and_author(line: str) -> tuple[str, None] | tuple[str, str]:
m = regex_title.match(line)
if not m:
print('No Match for author and title split! - will return line as author')
print(line)
return line, None
groups = m.groups()
if len(groups) != 2:
print(groups)
else:
title, author = groups
return title.strip(), author.strip()
def extract_meta_data_from_second_line(line: str) -> Tuple:
    """Parse the metadata line (second line) of a clipping.

    The line is split on ``|``. Three fields are read as page, position and
    timestamp; two fields are read as either position/location or page,
    followed by the timestamp. Example of a three-field line::

        - Ihre Markierung auf Seite 262 | Position 7077-7078 | Hinzugefügt am Sonntag, 27. Mai 2012 um 01:31:13 Uhr

    Args:
        line: The metadata line of a clipping.

    Returns:
        A 4-tuple ``(page, position_start, position_end, timestamp)``.
        ``page`` and the positions are strings or ``None``; ``timestamp`` is
        the result of ``dateparser.parse`` (falsy when the date could not be
        parsed). If the line has neither two nor three fields, all four
        entries are ``None`` so that callers can still unpack the result.
    """
    meta = line.split('|')
    page = None
    position = None
    # Initialised up front so they are always bound, even when the position
    # field has an unexpected shape.
    position_start = None
    position_end = None

    if len(meta) == 3:
        page = meta[0].replace('- Ihre Markierung auf Seite ', '').strip()
        position = meta[1]
        timestamp = meta[2]
    elif len(meta) == 2:
        if 'Position' in meta[0] or 'Location' in meta[0]:
            position = (
                meta[0]
                .replace('- Ihre Markierung Position ', '')
                .replace('- Your Highlight Location', '')
                .strip()
            )
        elif 'Seite' in meta[0]:
            page = meta[0].replace('- Ihre Markierung auf Seite ', '').strip()
        timestamp = meta[1]
    else:
        print('cannot parse this meta information')
        print(meta)
        return (None, None, None, None)

    position_parts = []
    if position:
        # Drop the keyword in either language, then split "start-end".
        position_parts = (
            position.replace('Position', '').replace('Location', '').strip().split('-')
        )

    if len(position_parts) == 2:
        position_start, position_end = position_parts
    elif len(position_parts) == 1:
        position_start = position_parts[0]
    else:
        # No position at all, or more than one '-' in the field.
        print('cannot parse this meta')
        print(meta)

    timestamp = timestamp.replace('Hinzugefügt am ', '').replace('Added on', '')
    timestamp = dateparser.parse(timestamp)
    if not timestamp:
        print(meta)
        print(timestamp)
    return (page, position_start, position_end, timestamp)
def process_clipping(clipping) -> Dict | None:
# every clipping has a title, content and a date
clipping_dict = {}
clipping = clipping.strip()
if not clipping:
return None
clipping_lines = clipping.split('\r\n')
# print(f'clipping hat {len(clipping_lines)} Zeilen')
if len(clipping_lines) == 2:
# print('skipping bookmark without notes')
return None
if not clipping_lines or len(clipping_lines) < 4:
print('unregelmäßiges clipping:' + clipping)
return None
title, author = split_title_and_author(clipping_lines[0])
if not title or not author:
print(clipping)
return None
page_number, position_start, position_end, timestamp = extract_meta_data_from_second_line(clipping_lines[1])
# TODO date option, nicht nach datum sortieren
content = " ".join(clipping_lines[3:])
# example for a clipping:
# Jane Eyre (Brontë, Charlotte)
# - Ihre Markierung auf Seite 262 | Position 7077-7078 | Hinzugefügt am Sonntag, 27. Mai 2012 um 01:31:13 Uhr
# Reserved people often really need the frank discussion of their sentiments and griefs more than the expansive.
# ==========
clipping_dict = {
'author': author,
'title': title,
'page': page_number,
'position_start': position_start,
'position_end': position_end,
'timestamp': timestamp,
'content': content,
}
# print(clipping_dict)
return clipping_dict
def _position_sort_key(clipping_dict):
    """Sort key: numeric start position; missing or non-numeric positions go last."""
    try:
        return (0, int(clipping_dict['position_start']))
    except (TypeError, ValueError):
        return (1, 0)


def process_file():
    """Read the clippings file and write one Markdown file per book.

    The file named by the module-level ``filename`` is split into clippings,
    each clipping is parsed with ``process_clipping`` and grouped by title.
    For every book the clippings are ordered by start position and written to
    ``clippings/<year>/<sanitised title>.md``, where ``<year>`` is taken from
    the timestamp of the first clipping in that order. Books whose first
    clipping has no timestamp are written to ``clippings/unkown_year``.
    """
    output_path = Path('clippings')
    # These directories are created up front, as before; directories for any
    # other year are created on demand further down.
    yearly_directories = [output_path.joinpath(str(year)) for year in range(2012, 2023)]
    for directory in [output_path, output_path.joinpath('unkown_year')] + yearly_directories:
        directory.mkdir(parents=True, exist_ok=True)

    # 'utf-8-sig' drops a leading byte-order mark if present, so it does not
    # end up glued to the first title. codecs.open does not translate line
    # endings, which the separator split below relies on.
    with codecs.open(filename, 'r', 'utf-8-sig') as fh:
        text = fh.read()

    clippings = re.split(r'\r?\n==========\r?\n', text)
    print(f'found {len(clippings)} highlights and notes')

    ebooks = defaultdict(list)
    clippings.reverse()
    for i, c in enumerate(clippings):
        if i % 100 == 0:
            print(f"Processing clipping nr. {i}")
        clipping_dict = process_clipping(c)
        if clipping_dict:
            ebooks[clipping_dict['title']].append(clipping_dict)

    for ebook, clipping_list in ebooks.items():
        # Positions are stored as strings; compare them as numbers so that
        # e.g. 999 sorts before 1000, and tolerate missing positions.
        clipping_list = sorted(clipping_list, key=_position_sort_key)
        first_clipping = clipping_list[0]

        timestamp = first_clipping.get('timestamp')
        if timestamp:
            year = timestamp.year
        else:
            print('no timestamp')
            print(first_clipping)
            year = 'unkown_year'

        print(ebook)
        # Keep only characters that are safe in a file name.
        ebook = re.sub(r'[^0-9a-zA-ZäöüßÄÖÜ\-()\s]+', '', ebook)
        print(f'...wurde zu {ebook}\n')

        markdown_filepath = output_path.joinpath(str(year)).joinpath(ebook + '.md')
        # Years outside the pre-created range would otherwise fail on open.
        markdown_filepath.parent.mkdir(parents=True, exist_ok=True)
        with codecs.open(markdown_filepath, 'w', 'utf-8') as fh:
            print(str(markdown_filepath))
            fh.write('# ' + first_clipping['title'] + '\n\n')
            fh.write('## ' + first_clipping['author'] + '\n\n')
            for clipping_dict in clipping_list:
                fh.write(clipping_dict['content'] + '\n\n')

    # Disabled JSON export of all clippings, kept for reference:
    # fh = codecs.open('clippings_all.json', 'w', 'utf-8')
    # fh.write(json.dumps(list(ebooks.items()), indent=4, sort_keys=True, ensure_ascii=False, default=str))
    # fh.close()
# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    process_file()