-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextract.js
158 lines (157 loc) · 4.84 KB
/
extract.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
/* SCRIPT TO BE ADDED TO A LIBREOFFICE DOC EXPORTED AS HTML TO CLEAN DOM & EXTRACT CONTENT AS JSON */
/**
 * Returns the text carried by a DOM node, or the value itself when it is
 * already a string.
 *
 * `innerText` is tried first, then `textContent` (text nodes expose only the
 * latter). A node whose text is empty yields "" instead of the node object,
 * so callers can always hand the result to string helpers.
 *
 * @param {Node|string} node - DOM node (or string) to read.
 * @returns {string} The text found, or "" when the node has none.
 * @throws {TypeError} When `node` is null or undefined.
 */
const getText = (node) => {
  if (typeof node === "string") {
    return node;
  }
  return node.innerText || node.textContent || "";
};
/**
 * Tidies whitespace in a string: strips it from both ends and replaces every
 * run of two or more whitespace characters with one space.
 *
 * A single whitespace character inside the text (for example a lone newline)
 * is left as it is.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The cleaned text.
 */
const normalize = (text) => text.trim().replace(/\s{2,}/g, " ");
/**
 * Locates the author text that accompanies a title element.
 *
 * The candidate is the last child of the title's parent; when that last child
 * is the title itself, the parent's next sibling is used instead.
 *
 * @param {Element} element - Title element.
 * @returns {*} Whatever getText yields for the candidate node, or the
 *   candidate node unchanged (possibly null) when getText throws.
 */
const findAuthor = (element) => {
  const container = element.parentElement;
  const trailingNode = container.lastChild;
  let author = trailingNode === element ? container.nextSibling : trailingNode;
  try {
    author = getText(author);
  } catch {
    // Best effort: log the node that could not be read and hand it back as-is.
    console.log(author);
  }
  return author;
};
/**
 * Splits an author string into a list of cleaned author names.
 *
 * Only one separator is used: the first of "&", " et ", or a newline that
 * occurs in the text. When none occurs the whole text is a single name.
 * Each name is normalized and a leading "de " is dropped.
 *
 * @param {string} text - Raw author text.
 * @returns {string[]} Cleaned author names.
 */
const parseAuthor = (text) => {
  const candidates = ["&", " et ", "\n"];
  const separator = candidates.find((candidate) => text.includes(candidate));
  return text.split(separator).map((rawName) => {
    const name = normalize(rawName);
    return name.startsWith("de ") ? name.slice(3) : name;
  });
};
/**
 * Gathers the text of a node together with the run of immediately following
 * siblings whose nodeName equals `selector`, and returns it normalized.
 *
 * @param {Node} element - First node of the run.
 * @param {string} selector - nodeName the following siblings must have (e.g. "U").
 * @param {Node[]} [text=[]] - Accumulator; visited nodes are pushed onto it.
 * @returns {string} Normalized concatenation of the text of every collected node.
 */
const findCompleteTextNode = (element, selector, text = []) => {
  let current = element;
  text.push(current);
  // Walk forward while the next sibling is still part of the same run.
  while (current.nextSibling && current.nextSibling.nodeName == selector) {
    current = current.nextSibling;
    text.push(current);
  }
  const joined = text.map((part) => getText(part)).join("");
  return normalize(joined);
};
/**
 * Finds the first `<i>` element located in the next element sibling of
 * `element` or of one of its ancestors.
 *
 * At each level only the next element sibling is inspected; when it is
 * missing or contains no `<i>`, the search moves up to the parent. The climb
 * stops once there is no parent left, instead of dereferencing null.
 *
 * @param {Element|null} element - Element to start from.
 * @returns {Element|null} The `<i>` element found, or null when there is none.
 */
const findSummary = (element) => {
  for (let current = element; current != null; current = current.parentElement) {
    const sibling = current.nextElementSibling;
    if (sibling != null) {
      const summary = sibling.querySelector("i");
      if (summary !== null) {
        return summary;
      }
    }
  }
  return null;
};
/**
 * Collects summary paragraphs that follow an element.
 *
 * Walks the next element siblings one by one; for each sibling containing an
 * `<i>`, the normalized text of its first `<i>` is appended. The walk ends at
 * the first sibling without an `<i>`.
 *
 * @param {Element} element - Element whose following siblings are scanned.
 * @param {string[]} [summary=[]] - Accumulator the texts are pushed onto.
 * @returns {string[]} The accumulator.
 */
const completeSummary = (element, summary = []) => {
  let current = element;
  try {
    for (;;) {
      const following = current.nextElementSibling;
      const italic = following.querySelector("i");
      if (italic == null) {
        break;
      }
      summary.push(normalize(getText(italic)));
      current = following;
    }
  } catch {
    // Any failure (typically running out of siblings) simply ends the walk;
    // whatever was gathered so far is returned.
  }
  return summary;
};
/**
 * Collects the headings that precede a node, walking backwards through its
 * previous siblings.
 *
 * - h2 is optional: only the closest preceding H2 is recorded.
 * - h1 is mandatory: the walk stops at the first H1 met.
 *
 * When the siblings run out before any H1 is found, the tags gathered so far
 * are returned (without an `h1` key) rather than dereferencing null.
 *
 * @param {Node} node - Node whose preceding siblings are scanned.
 * @param {{h1?: string, h2?: string}} [tags={}] - Accumulator, filled in place.
 * @returns {{h1?: string, h2?: string}} The accumulator.
 */
const extractTags = (node, tags = {}) => {
  for (let previous = node.previousSibling; previous != null; previous = previous.previousSibling) {
    if (previous.nodeName === "H1") {
      tags.h1 = normalize(getText(previous));
      return tags;
    }
    // Keep only the nearest H2; earlier ones are skipped.
    if (previous.nodeName === "H2" && typeof tags.h2 === "undefined") {
      tags.h2 = normalize(getText(previous));
    }
  }
  return tags;
};
/**
 * Builds one record per title found in the document.
 *
 * A title is every `<u>` that is the first child of its parent. For each one
 * the title text, the summary (summary paragraphs joined with "\n"), the
 * author list and the heading tags are gathered. A failed lookup is reported
 * with console.error and the record is still returned.
 *
 * @returns {{titre: string, resume: string, auteurs: string[], tags: Object}[]}
 *   One record per title, in document order.
 */
const extractContent = () => {
  const titleNodes = [...document.querySelectorAll("u:first-child")];
  return titleNodes.map((titleNode) => {
    const titre = findCompleteTextNode(titleNode, "U");

    const resume = completeSummary(titleNode.parentElement).join("\n");
    if (resume === "" || resume === "\n") {
      console.error(`failed finding summary on title ${titre}`);
    }

    const auteurs = parseAuthor(findAuthor(titleNode));
    // An author equal to the title means the author lookup landed on the title itself.
    if (auteurs[0] === titre) {
      console.error(`failed finding author on title ${titre}`);
    }

    const tags = extractTags(titleNode.parentElement);
    // `tags === {}` compares identities and is never true; test for emptiness instead.
    if (Object.keys(tags).length === 0) {
      console.error(`failed finding tags on title ${titre}`);
    }

    return { titre, resume, auteurs, tags };
  });
};
/**
 * Rebuilds document.body with a simplified structure.
 *
 * Every top-level child of the body except SCRIPT elements is processed:
 * - a child containing no `<i>` and no `<u>` becomes an element of the same
 *   nodeName holding only its normalized text (dropped when that text is empty);
 * - otherwise, for "i" and then "u", one `<p>` is built holding the
 *   concatenated styled text wrapped in that tag, followed by the rest of the
 *   child's text as a plain text node.
 *
 * The body's innerHTML is finally replaced by the rebuilt content.
 * Elements without any text are skipped instead of raising a TypeError.
 */
const cleanDoc = () => {
  // getText may hand back the node itself when it carries no text; read that as "".
  const rawText = (node) => {
    const value = getText(node);
    return typeof value === "string" ? value : "";
  };
  const cleanText = (node) => normalize(rawText(node));
  // A lone closing quote is treated like empty text.
  const isMeaningful = (text) => text !== "" && text !== '”';

  const newDoc = document.body.cloneNode();
  const topLevel = [...document.querySelector("body").children]
    .filter((child) => child.nodeName !== "SCRIPT");

  for (const node of topLevel) {
    const hasStyledChild = node.querySelector("i") !== null
      || node.querySelector("u") !== null;

    if (!hasStyledChild) {
      const plainText = cleanText(node);
      if (plainText !== "") {
        const newElem = document.createElement(node.nodeName);
        newElem.innerText = plainText;
        newDoc.append(newElem);
      }
      continue;
    }

    for (const tagName of ["i", "u"]) {
      const elements = [...node.querySelectorAll(tagName)];
      if (elements.length === 0) {
        continue;
      }
      const styledText = normalize(elements.map((styled) => rawText(styled)).join(""));
      // The unstyled part is whatever follows the first styledText.length characters.
      const unstyledText = cleanText(node).slice(styledText.length);

      const p = document.createElement("p");
      if (isMeaningful(styledText)) {
        const newElem = document.createElement(tagName);
        newElem.innerText = styledText;
        p.append(newElem);
      }
      if (isMeaningful(unstyledText)) {
        p.append(document.createTextNode(unstyledText));
      }
      if (cleanText(p) !== "") {
        newDoc.append(p);
      }
    }
  }

  document.body.innerHTML = newDoc.innerHTML;
};
// Entry point: simplify the DOM, extract the records, attach the known EAN
// codes, then replace the page content with the resulting JSON.
cleanDoc();
const data = extractContent();
// EAN codes keyed by the exact extracted title (`titre`).
const ean = {
  "Un tigre dans mon jardin": "9782919181025",
  "Panic city": "9782919181063",
  "Trafic Ocean": "9782919181032",
  "Un tout petit point": "9782919181087",
};
for (const [titre, code] of Object.entries(ean)) {
  const entry = data.find((d) => d.titre === titre);
  if (entry) {
    entry.ean = code;
  } else {
    // A title missing from the extraction must not abort the whole export.
    console.error(`no extracted entry matches EAN title ${titre}`);
  }
}
document.body.innerHTML = JSON.stringify(data);