diff --git a/DuBibtex.py b/DuBibtex.py
index 2f44a8d..c717409 100644
--- a/DuBibtex.py
+++ b/DuBibtex.py
@@ -45,30 +45,30 @@ def request_url(url):
 
 
 class Re:
-    bib = re.compile('\s*\@(\w+)\s*\{\s*([^\{\,\}]+),')
-    item = re.compile('\s*(\w+)\s*=\s*[\{"]\s*(.*)\s*[\}"]')
-    item2 = re.compile('\s*(\w+)\s*=\s*[\{"]\{\s*(.*)\s*[\}"]\}')
-    endl = re.compile('\s*}\s*')
-    abbr = re.compile('@string', flags=re.IGNORECASE)
-    doiJson = re.compile('doi\.org\\?\/([\w\d\.\-\\\/]+)', flags=re.MULTILINE)
-    doiUrl = re.compile('doi\.org\/([\w\d\.\-\\\/]+)', flags=re.MULTILINE)
+    bib = re.compile(r'\s*\@(\w+)\s*\{\s*([^\{\,\}]+),')
+    item = re.compile(r'\s*(\w+)\s*=\s*[\{"]\s*(.*)\s*[\}"]')
+    item2 = re.compile(r'\s*(\w+)\s*=\s*[\{"]\{\s*(.*)\s*[\}"]\}')
+    endl = re.compile(r'\s*}\s*')
+    abbr = re.compile(r'@string', flags=re.IGNORECASE)
+    doiJson = re.compile(r'doi\.org\\?\/([\w\d\.\-\\\/]+)', flags=re.MULTILINE)
+    doiUrl = re.compile(r'doi\.org\/([\w\d\.\-\\\/]+)', flags=re.MULTILINE)
     doiAcmUrl = re.compile(
-        'https:\/\/dl\.acm\.org\/doi\/(?:\w+\/)?([\w\d\.\-\\\/]+)',
+        r'https:\/\/dl\.acm\.org\/doi\/(?:\w+\/)?([\w\d\.\-\\\/]+)',
         flags=re.MULTILINE)
-    doiJavascript = re.compile('doi\"\:\"([\w\d\.\-\\\/]+)\"', flags=re.MULTILINE)
-    doiText = re.compile('"DOI":"([\w\.\\\/]*)"', flags=re.MULTILINE)
-    doiSpringer = re.compile('chapter\/([\w\.\\\/\_\-]+)', flags=re.MULTILINE)
-    doiWiley = re.compile('doi\/abs\/([\w\.\\\/\_\-]+)', flags=re.MULTILINE)
-    doiCaltech = re.compile('authors\.library\.caltech\.edu\/(\d+)',
+    doiJavascript = re.compile(r'doi\"\:\"([\w\d\.\-\\\/]+)\"', flags=re.MULTILINE)
+    doiText = re.compile(r'"DOI":"([\w\.\\\/]*)"', flags=re.MULTILINE)
+    doiSpringer = re.compile(r'chapter\/([\w\.\\\/\_\-]+)', flags=re.MULTILINE)
+    doiWiley = re.compile(r'doi\/abs\/([\w\.\\\/\_\-]+)', flags=re.MULTILINE)
+    doiCaltech = re.compile(r'authors\.library\.caltech\.edu\/(\d+)',
                             flags=re.MULTILINE)
-    doiPubmed = re.compile('nlm\.nih\.gov\/pubmed\/(\d+)', flags=re.MULTILINE)
-    urlArxiv = re.compile('arxiv\.org\/pdf\/([\d\.]+)', flags=re.MULTILINE)
-    acm = re.compile('citation\.cfm\?id\=([\d\.]+)', flags=re.MULTILINE)
-    acmBib = re.compile('(.+)<\/pre>',
+    doiPubmed = re.compile(r'nlm\.nih\.gov\/pubmed\/(\d+)', flags=re.MULTILINE)
+    urlArxiv = re.compile(r'arxiv\.org\/pdf\/([\d\.]+)', flags=re.MULTILINE)
+    acm = re.compile(r'citation\.cfm\?id\=([\d\.]+)', flags=re.MULTILINE)
+    acmBib = re.compile(r'(.+)<\/pre>',
                         flags=re.MULTILINE | re.IGNORECASE | re.S)
-    ieee = re.compile('ieee\.org(?:\/abstract)?\/document\/(\d+)', flags=re.MULTILINE)
+    ieee = re.compile(r'ieee\.org(?:\/abstract)?\/document\/(\d+)', flags=re.MULTILINE)
     neurips = re.compile(r'proceedings.neurips.cc', flags=re.MULTILINE)
-    year = re.compile('\w+(\d+)')
+    year = re.compile(r'\w+(\d+)')
 
 
 class Parser:
@@ -320,9 +320,10 @@ def crossref_lookup(_title):
     if m and len(m.groups()) > 0:
         res = m.groups()[0]
         res = res.replace('\\', '')
-        if Paras.debugBibCrawler:
-            print("DOI from CrossRef Lookup: %s\n" % res)
-        return res
+        if 'policy' not in res:
+            if Paras.debugBibCrawler:
+                print("DOI from CrossRef Lookup: %s\n" % res)
+            return res
     return None
 
 
@@ -522,7 +523,7 @@ def google_lookup(s, parser):
 
 
 def fix_underscore(s):
-    return re.sub('[^\_]\_', '\\\_', s)
+    return re.sub(r'[^\_]\_', r'\\_', s)  # repl r'\\_' expands to one backslash + '_', as before
 
 
 def fix_abs_pdf(s):