# Source code for the ``doi`` module.

import re
import sys
import logging

from typing import Optional


# Single-source package version string.
__version__ = '0.1.1'
# Module-level logger; the type comment is kept for pre-3.6 style annotation.
logger = logging.getLogger("doi")   # type: logging.Logger


def pdf_to_doi(filepath: str, maxlines: Optional[int] = None) -> Optional[str]:
    """Try to get a DOI from a file path.

    It looks for a DOI regex in the raw binary data and returns the first
    DOI found, in the hopes that this DOI is the correct one.

    :param filepath: Path to the pdf file.
    :param maxlines: Maximum number of lines that should be checked.
        For some documents, it could spend a long time trying to look for
        a DOI, and DOIs in the middle of documents don't tend to be the
        correct DOI of the document.
    :returns: DOI or ``None``.
    """
    if maxlines is None:
        maxlines = sys.maxsize
    with open(filepath, 'rb') as fd:
        for j, line in enumerate(fd):
            doi = find_doi_in_text(line.decode('ascii', errors='ignore'))
            if doi:
                return doi
            # Give up once exactly ``maxlines`` lines have been scanned.
            # (Previously the bound was checked *after* scanning with
            # ``j > maxlines``, which read up to maxlines + 2 lines.)
            if j + 1 >= maxlines:
                return None
    return None
def validate_doi(doi: str) -> Optional[str]:
    """Check that the DOI can be resolved by `official means
    <http://www.doi.org/factsheets/DOIProxy.html>`_.  If so, return the
    resolved URL, otherwise ``None`` is returned only when the handle
    record contains no URL entry.

    :param doi: Identifier.
    :returns: The URL assigned to the DOI or ``None``.
    :raises ValueError: If the handle server answers with an HTTP error
        (404 means the DOI does not exist).
    """
    import json
    import urllib.request
    from urllib.error import HTTPError

    url = "https://doi.org/api/handles/{doi}".format(doi=doi)
    logger.debug('handle url %s', url)
    request = urllib.request.Request(url)
    try:
        result = json.loads(urllib.request.urlopen(request).read().decode())
    except HTTPError as error:
        # Report the actual status code instead of always claiming 404
        # (the message is unchanged for the common not-found case), and
        # chain the original error for debugging.
        raise ValueError(
            'HTTP {code}: DOI not found'.format(code=error.code)) from error
    # The handle record lists several typed values; pick the first URL.
    urls = [v['data']['value']
            for v in result['values']
            if v.get('type') == 'URL']
    return urls[0] if urls else None
def get_clean_doi(doi: str) -> str:
    """Check if the DOI is actually a URL and in that case just get
    the exact DOI, stripping URL- and pdf-extraction artifacts.

    :param doi: String containing a DOI.
    :returns: The extracted DOI.
    """
    # Percent-encoded slash; also accept lowercase '%2f' (percent
    # encodings are case-insensitive per RFC 3986).
    doi = re.sub(r'%2F', '/', doi, flags=re.IGNORECASE)
    # Artifacts left behind by pdf link annotations.
    doi = re.sub(r'\)>', ' ', doi)
    doi = re.sub(r'\)/S/URI', ' ', doi)
    # Publisher URL suffix that is not part of the DOI.
    doi = re.sub(r'(/abstract)', '', doi)
    # Trailing closing parenthesis from surrounding text.
    doi = re.sub(r'\)$', '', doi)
    return doi
def find_doi_in_text(text: str) -> Optional[str]:
    """Try to find a DOI in a text.

    :param text: Text in which to look for DOI.
    :returns: A DOI, if found, otherwise ``None``.
    """
    text = get_clean_doi(text)
    forbidden_doi_characters = r'"\s%$^\'<>@,;:#?&'
    # Sometimes it is in the javascript defined
    var_doi = re.compile(
        r'doi(.org)?'
        r'\s*(=|:|/|\()\s*'
        r'("|\')?'
        r'(?P<doi>[^{fc}]+)'
        r'("|\'|\))?'
        .format(
            fc=forbidden_doi_characters
        ),
        re.I
    )
    # Only the first occurrence matters, so a plain search suffices.
    match = var_doi.search(text)
    if match is None:
        return None
    return get_clean_doi(match.group('doi'))
def get_real_url_from_doi(doi: str) -> Optional[str]:
    """Get a URL corresponding to a DOI.

    :param doi: Identifier.
    :returns: A URL for the DOI.  If the DOI is invalid, return ``None``.
    """
    url = validate_doi(doi)
    if url is None:
        return None
    # Elsevier's "linkinghub" redirector pages do not resolve cleanly;
    # rewrite them to the direct ScienceDirect article URL.
    elsevier = re.match(
        r'.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*', url, re.I)
    if elsevier:
        pii = elsevier.group(1)
        return ('https://www.sciencedirect.com/science/article/abs/pii/{pii}'
                .format(pii=pii))
    return url