import re
import sys
import logging
from typing import Optional
# Version string of this module.
__version__ = '0.1.1'
# Module-wide logger, registered under the fixed name "doi".
logger = logging.getLogger("doi") # type: logging.Logger
def pdf_to_doi(filepath: str, maxlines: Optional[int] = None) -> Optional[str]:
    """Try to get a DOI from the file at *filepath*.

    The file is read in binary mode, line by line. Each line is decoded
    as ASCII (undecodable bytes are dropped) and searched with
    :func:`find_doi_in_text`. The first DOI found is returned, in the
    hopes that this DOI is the correct one.

    :param filepath: Path to the pdf file.
    :param maxlines: Maximum number of lines that should be checked.
        For some documents, it could spend a long time trying to look for
        a DOI, and DOIs in the middle of documents don't tend to be the
        correct DOI of the document. ``None`` (the default) means that
        the whole file is scanned; a value of zero or less means that no
        line is scanned.
    :returns: DOI or ``None``.
    """
    with open(filepath, 'rb') as fd:
        for lineno, raw_line in enumerate(fd):
            # ``lineno`` is zero-based, so stopping here, *before* the
            # line is examined, means at most ``maxlines`` lines are
            # ever searched.
            if maxlines is not None and lineno >= maxlines:
                break
            doi = find_doi_in_text(
                raw_line.decode('ascii', errors='ignore'))
            if doi:
                return doi
    return None
def validate_doi(doi: str) -> Optional[str]:
    """Resolve a DOI through the doi.org handle API.

    We check that the DOI can be resolved by
    `official means <http://www.doi.org/factsheets/DOIProxy.html>`_. The
    JSON answer is searched for values of type ``URL`` and the first one
    is returned.

    :param doi: Identifier.
    :returns: The URL assigned to the DOI, or ``None`` if the answer
        contains no value of type ``URL``.
    :raises ValueError: If the server answers with an HTTP error status
        (for instance 404 for an unknown DOI). The original
        ``HTTPError`` is attached as ``__cause__``.
    """
    from urllib.error import HTTPError
    import urllib.request
    import json

    url = "https://doi.org/api/handles/{doi}".format(doi=doi)
    logger.debug('handle url %s', url)
    request = urllib.request.Request(url)

    try:
        # The context manager closes the connection even when reading
        # the body fails.
        with urllib.request.urlopen(request) as response:
            payload = response.read().decode()
    except HTTPError as err:
        # Report the real status instead of always claiming a 404.
        if err.code == 404:
            message = 'HTTP 404: DOI not found'
        else:
            message = 'HTTP {code}: DOI could not be resolved'.format(
                code=err.code)
        raise ValueError(message) from err

    result = json.loads(payload)
    urls = [value['data']['value']
            for value in result['values'] if value.get('type') == 'URL']
    return urls[0] if urls else None
def get_clean_doi(doi: str) -> str:
    """Remove URL-encoding and known surrounding artifacts from a DOI.

    The following substitutions are applied, in this order:

    * ``%2F`` (in any letter case) becomes ``/``;
    * ``)>`` and ``)/S/URI`` are each replaced by a single space;
    * ``/abstract`` is removed;
    * one closing parenthesis at the end of the string is removed.

    No attempt is made to isolate the DOI from other surrounding text.

    :param doi: String containing a DOI.
    :returns: The cleaned string.
    """
    # Hex digits of a percent-encoded byte are case-insensitive, so
    # ``%2f`` has to be decoded exactly like ``%2F``.
    doi = re.sub(r'%2F', '/', doi, flags=re.I)
    # For pdfs
    doi = doi.replace(')>', ' ')
    doi = doi.replace(')/S/URI', ' ')
    doi = doi.replace('/abstract', '')
    # Kept as a regex: ``$`` also matches right before a final newline.
    doi = re.sub(r'\)$', '', doi)
    return doi
def find_doi_in_text(text: str) -> Optional[str]:
    """Look for the first DOI-like token in a piece of text.

    The text is first passed through :func:`get_clean_doi`. It is then
    searched, ignoring case, for ``doi`` followed by one of ``=``, ``:``,
    ``/`` or ``(`` and a run of characters allowed in a DOI. The
    captured run is cleaned once more before being returned.

    :param text: Text in which to look for DOI.
    :returns: A DOI, if found, otherwise ``None``.
    """
    cleaned = get_clean_doi(text)
    # Any of these characters ends the candidate DOI.
    excluded = r'"\s%$^\'<>@,;:#?&'
    # Sometimes it is in the javascript defined
    # NOTE(review): the dot in ``(.org)`` is unescaped, so it matches
    # any single character, not only a literal ".".
    pattern = (
        r'doi(.org)?'
        r'\s*(=|:|/|\()\s*'
        r'("|\')?'
        r'(?P<doi>[^{fc}]+)'
        r'("|\'|\))?'
    ).format(fc=excluded)

    found = re.search(pattern, cleaned, re.I)
    if found is None:
        return None
    return get_clean_doi(found.group('doi'))
def get_real_url_from_doi(doi: str) -> Optional[str]:
    """Get a URL corresponding to a DOI.

    The DOI is resolved with :func:`validate_doi`. If the resolved URL
    is an Elsevier ``linkinghub`` address containing a ``/pii/``
    component, the matching sciencedirect.com abstract page is returned
    in its place.

    :param doi: Identifier.
    :returns: A URL for the DOI, or ``None`` if :func:`validate_doi`
        returns ``None``.
    :raises ValueError: Propagated unchanged from :func:`validate_doi`.
    """
    resolved = validate_doi(doi)
    if resolved is None:
        return None

    elsevier = re.match(
        r'.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*', resolved, re.I)
    if elsevier is None:
        return resolved

    # Build the abstract page address from the captured PII.
    pii = elsevier.group(1)
    return ('https://www.sciencedirect.com/science/article/abs/pii/{pii}'
            .format(pii=pii))