-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMathGenealogy.py
78 lines (63 loc) · 2.59 KB
/
MathGenealogy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# -*- coding: utf-8 -*-
"""
MathGenealogy.py
A class to visualize the Mathematics Genealogy Project
(https://www.genealogy.math.ndsu.nodak.edu)
Scrape the page of each person,
get their direct supervisors and draw a graph.
---
USAGE:
# import the class
from MathGenealogy import MathGenealogy
# create instance
MG = MathGenealogy()
# execute the method
MG.find_ancestors(<PersonID>,<PersonName>)
# PersonID can be found in the website (https://www.genealogy.math.ndsu.nodak.edu).
# Currently not so convenient. Will be changed soon.
"""
import pydot
import requests
from bs4 import BeautifulSoup
class MathGenealogy:
    """Collect a person's academic ancestors and draw them as a digraph.

    Advisor links are scraped from the Mathematics Genealogy Project page of
    each person, starting at a given ID and following every advisor found.

    Typical use::

        MG = MathGenealogy()
        MG.find_ancestors("<PersonID>", "<PersonName>")
    """

    baseURL = "https://www.genealogy.math.ndsu.nodak.edu/id.php?id="
    # Seconds to wait on each HTTP request, so a stalled server cannot hang
    # the traversal forever. Override on the class or instance if needed.
    requestTimeout = 30

    # Main function to retrieve the ancestors data
    def find_ancestors(self, personID, personName):
        """Collect all ancestors of one person and write the graph image.

        Args:
            personID: ID of the starting person on the website. Converted to
                ``str``, so an integer ID is accepted as well.
            personName: Label used for the starting person in the graph.

        Side effects:
            Fills ``_dictName`` (``[id, name]`` rows), ``_dictId`` (the ids of
            those rows) and ``_dictPair`` (``[advisor id, student id]`` rows),
            then calls ``draw_ancestors`` with ``personID`` as the file name.
        """
        personID = str(personID)
        # _dictName contains the ID and the name; the caller-supplied name is
        # stored first so it wins if the site spells the same person
        # differently.
        self._dictName = [[personID, personName]]
        # _dictPair contains a person's ID and the ID of one of his/her
        # direct students.
        self._dictPair = []

        self.parse_ancestors(personID)

        # Order-preserving de-duplication keeps the result deterministic.
        self._dictName = self._unique(self._dictName)
        self._dictPair = self._unique(self._dictPair)
        self._dictId = [entry[0] for entry in self._dictName]

        # Resolve every ID to the first name recorded for it.
        nameById = {}
        for knownId, knownName in self._dictName:
            nameById.setdefault(knownId, knownName)

        dictPairName = [
            [nameById[pair[0]], nameById[pair[1]]] for pair in self._dictPair
        ]
        self.draw_ancestors(personID, dictPairName)

    # A function to extract the direct supervisors of each person, level by level
    def parse_ancestors(self, idAncestor):
        """Walk up the advisor links starting from ``idAncestor``.

        Every advisor found is appended to ``_dictName`` and every
        advisor/student relation to ``_dictPair``; both lists must already
        exist (``find_ancestors`` creates them).

        Each ID is fetched at most once, so an advisor shared by several
        descendants is not downloaded again and a cyclic link cannot cause
        endless recursion.
        """
        startId = str(idAncestor)
        pending = [startId]
        seen = {startId}
        while pending:
            currentId = pending.pop()
            for ancId, ancName in self._fetch_advisors(currentId):
                self._dictName.append([ancId, ancName])
                self._dictPair.append([ancId, currentId])
                if ancId not in seen:
                    seen.add(ancId)
                    pending.append(ancId)

    def _fetch_advisors(self, personID):
        """Download one person's page and return its advisor links.

        Returns:
            A list of ``(advisor id, advisor name)`` tuples taken from the
            anchors inside the second ``<p>`` directly under
            ``#paddingWrapper``.

        Raises:
            IndexError: If the page has fewer than two such paragraphs.
        """
        response = requests.get(
            self.baseURL + personID, timeout=self.requestTimeout
        )
        page = BeautifulSoup(response.text, "html.parser")
        paragraphs = page.select("#paddingWrapper > p")
        if len(paragraphs) < 2:
            # Same exception type the bare index lookup used to raise, but
            # with enough context to tell which page was unexpected.
            raise IndexError(
                "no advisor section found for id {!r} (HTTP status {})".format(
                    personID, response.status_code
                )
            )

        advisors = []
        for link in paragraphs[1].find_all("a"):
            hrefParts = (link.get("href") or "").split("=")
            if len(hrefParts) < 2:
                # An anchor without an "id=" style target cannot be followed.
                continue
            # .string is None when the anchor wraps nested markup.
            label = link.string if link.string is not None else link.get_text()
            # str() detaches the text from the parse tree.
            advisors.append((hrefParts[1], str(label)))
        return advisors

    @staticmethod
    def _unique(rows):
        """Return ``rows`` as lists without duplicates, keeping first-seen order."""
        seen = set()
        result = []
        for row in rows:
            key = tuple(row)
            if key not in seen:
                seen.add(key)
                result.append(list(row))
        return result

    # Draw a digraph
    def draw_ancestors(self, fileName, pairName):
        """Write the advisor graph to ``<fileName>.png``.

        Args:
            fileName: Output path without the ``.png`` extension.
            pairName: Iterable of pairs whose first item is the advisor's
                name and second item the student's name; one edge is drawn
                from the first to the second.
        """
        graph = pydot.Dot(graph_type='digraph')
        nodes = {}
        for edge in pairName:
            advisorName, studentName = edge[0], edge[1]
            for name in (advisorName, studentName):
                # One node object per distinct name instead of one per edge.
                if name not in nodes:
                    nodes[name] = pydot.Node(name)
                    graph.add_node(nodes[name])
            graph.add_edge(pydot.Edge(nodes[advisorName], nodes[studentName]))
        graph.write(str(fileName) + ".png", format="png")