Skip to content

Commit

Permalink
Merge branch 'internetarchive:master' into 10196/refactor/enable-ruff…
Browse files Browse the repository at this point in the history
…-rule-SIM115
  • Loading branch information
techy4shri authored Jan 18, 2025
2 parents 6163276 + 3e18ea8 commit caf898b
Show file tree
Hide file tree
Showing 19 changed files with 5,568 additions and 2,519 deletions.
18 changes: 3 additions & 15 deletions docker/Dockerfile.olbase
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,10 @@ RUN apt-get -qq update && apt-get install -y \
COPY scripts/install_nodejs.sh ./
RUN ./install_nodejs.sh && rm ./install_nodejs.sh

# Install Archive.org nginx w/ IP anonymization
# Install nginx
USER root
RUN apt-get update && apt-get install -y --no-install-recommends nginx curl letsencrypt \
# nginx-plus
apt-transport-https lsb-release ca-certificates wget \
# log rotation service for ol-nginx
logrotate \
# rsync service for pulling monthly sitemaps from ol-home0 to ol-www0
rsync
COPY scripts/install_openresty.sh ./
RUN ./install_openresty.sh && rm ./install_openresty.sh
RUN rm /usr/sbin/nginx
RUN curl -L https://archive.org/download/nginx/nginx -o /usr/sbin/nginx
RUN chmod +x /usr/sbin/nginx
# Remove the stock nginx config file
RUN rm /etc/nginx/sites-enabled/default
COPY scripts/install_nginx.sh ./
RUN ./install_nginx.sh && rm ./install_nginx.sh

RUN mkdir -p /var/log/openlibrary /var/lib/openlibrary && chown openlibrary:openlibrary /var/log/openlibrary /var/lib/openlibrary \
&& mkdir /openlibrary && chown openlibrary:openlibrary /openlibrary \
Expand Down
3 changes: 3 additions & 0 deletions docker/covers_nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ server {
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers EECDH+CHACHA20:EECDH+AES128:RSA+AES128:EECDH+AES256:RSA+AES256:EECDH+3DES:RSA+3DES:!MD5;
ssl_prefer_server_ciphers on;

# Needed for logging/IP anonymization
include /olsystem/etc/nginx/logging_periodics.conf;
}

# Docker's internal load balancing ends up with unbalanced connections eventually.
Expand Down
6 changes: 5 additions & 1 deletion docker/nginx.conf
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Needed for IP anonymization
load_module modules/ngx_http_js_module.so;

user www-data;

# XXX-Anand: Oct 2013
Expand Down Expand Up @@ -25,7 +28,8 @@ http {
server_names_hash_bucket_size 64;
types_hash_bucket_size 64;

log_format iacombined '$remote_addr_ipscrub $host $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $request_time';
# Logging / IP Anonymization; also need logging_periodics.conf inside a server block
include /olsystem/etc/nginx/logging.conf;
access_log /var/log/nginx/access.log iacombined;

client_max_body_size 50m;
Expand Down
3 changes: 3 additions & 0 deletions docker/web_nginx.conf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ server {
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers EECDH+CHACHA20:EECDH+AES128:RSA+AES128:EECDH+AES256:RSA+AES256:EECDH+3DES:RSA+3DES:!MD5;
ssl_prefer_server_ciphers on;

# Needed for logging/IP anonymization
include /olsystem/etc/nginx/logging_periodics.conf;
}

server {
Expand Down
37 changes: 24 additions & 13 deletions openlibrary/catalog/add_book/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,15 @@
from infogami import config
from openlibrary import accounts
from openlibrary.catalog.add_book.load_book import (
InvalidLanguage,
build_query,
east_in_by_statement,
import_author,
)
from openlibrary.catalog.add_book.match import editions_match, mk_norm
from openlibrary.catalog.utils import (
EARLIEST_PUBLISH_YEAR_FOR_BOOKSELLERS,
InvalidLanguage,
format_languages,
get_non_isbn_asin,
get_publication_year,
is_independently_published,
Expand Down Expand Up @@ -818,20 +819,30 @@ def update_edition_with_rec_data(
'lc_classifications',
'oclc_numbers',
'source_records',
'languages',
]
for f in edition_list_fields:
if f not in rec or not rec[f]:
edition_dict: dict = edition.dict()
for field in edition_list_fields:
if field not in rec:
continue
# ensure values is a list
values = rec[f] if isinstance(rec[f], list) else [rec[f]]
if f in edition:
# get values from rec field that are not currently on the edition
case_folded_values = {v.casefold() for v in edition[f]}
to_add = [v for v in values if v.casefold() not in case_folded_values]
edition[f] += to_add

existing_values = edition_dict.get(field, []) or []
rec_values = rec.get(field, [])

# Languages in `rec` are ['eng'], etc., but import requires dict-style.
if field == 'languages':
formatted_languages = format_languages(languages=rec_values)
supplemented_values = existing_values + [
lang for lang in formatted_languages if lang not in existing_values
]
else:
edition[f] = to_add = values
if to_add:
case_folded_values = [v.casefold() for v in existing_values]
supplemented_values = existing_values + [
v for v in rec_values if v.casefold() not in case_folded_values
]

if existing_values != supplemented_values:
edition[field] = supplemented_values
need_edition_save = True

# Fields that are added as a whole if absent. (Individual values are not added.)
Expand All @@ -848,7 +859,7 @@ def update_edition_with_rec_data(
edition[f] = rec[f]
need_edition_save = True

# Add new identifiers
# Add new identifiers (dict values, so different treatment from lists above.)
if 'identifiers' in rec:
identifiers = defaultdict(list, edition.dict().get('identifiers', {}))
for k, vals in rec['identifiers'].items():
Expand Down
23 changes: 10 additions & 13 deletions openlibrary/catalog/add_book/load_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

import web

from openlibrary.catalog.utils import author_dates_match, flip_name, key_int
from openlibrary.catalog.utils import (
author_dates_match,
flip_name,
format_languages,
key_int,
)
from openlibrary.core.helpers import extract_year

if TYPE_CHECKING:
Expand Down Expand Up @@ -254,14 +259,6 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
return a


class InvalidLanguage(Exception):
    """Signals a language code with no matching `/languages/<code>` record."""

    def __init__(self, code):
        # Keep the rejected code available to handlers.
        self.code = code

    def __str__(self):
        rejected = self.code
        return f"invalid language code: '{rejected}'"

type_map = {'description': 'text', 'notes': 'text', 'number_of_pages': 'int'}


Expand All @@ -283,12 +280,12 @@ def build_query(rec: dict[str, Any]) -> dict[str, Any]:
east = east_in_by_statement(rec, author)
book['authors'].append(import_author(author, eastern=east))
continue

if k in ('languages', 'translated_from'):
for language in v:
if web.ctx.site.get('/languages/' + language.lower()) is None:
raise InvalidLanguage(language.lower())
book[k] = [{'key': '/languages/' + language.lower()} for language in v]
formatted_languages = format_languages(languages=v)
book[k] = formatted_languages
continue

if k in type_map:
t = '/type/' + type_map[k]
if isinstance(v, list):
Expand Down
115 changes: 113 additions & 2 deletions openlibrary/catalog/add_book/tests/test_add_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,10 @@ def test_load_with_redirected_author(mock_site, add_languages):


def test_duplicate_ia_book(mock_site, add_languages, ia_writeback):
"""
Here all fields that are 'used' (i.e. read and contribute to the edition)
are the same.
"""
rec = {
'ocaid': 'test_item',
'source_records': ['ia:test_item'],
Expand All @@ -312,16 +316,123 @@ def test_duplicate_ia_book(mock_site, add_languages, ia_writeback):
assert e.type.key == '/type/edition'
assert e.source_records == ['ia:test_item']

matching_rec = {
'ocaid': 'test_item',
'source_records': ['ia:test_item'],
# Titles MUST match to be considered the same
'title': 'Test item',
'languages': ['eng'],
}
reply = load(matching_rec)
assert reply['success'] is True
assert reply['edition']['status'] == 'matched'


def test_matched_edition_with_new_language_in_rec_adds_language(
    mock_site, add_languages, ia_writeback
):
    """
    A matching record that brings an extra language should extend the
    existing edition's languages without repeating the ones already there.
    """
    # First import: creates the edition with English only.
    initial_rec = {
        'ocaid': 'test_item',
        'source_records': ['ia:test_item'],
        'title': 'Test item',
        'languages': ['eng'],
    }
    created = load(initial_rec)
    assert created['success'] is True
    assert created['edition']['status'] == 'created'
    edition = mock_site.get(created['edition']['key'])
    assert edition.type.key == '/type/edition'
    assert edition.source_records == ['ia:test_item']

    # Second import: same item, now listing French ahead of the
    # already-present English.
    rec_with_extra_language = {
        'ocaid': 'test_item',
        'source_records': ['ia:test_item'],
        # Titles MUST match to be considered the same
        'title': 'Test item',
        'languages': ['fre', 'eng'],
    }
    matched = load(rec_with_extra_language)
    assert matched['success'] is True
    assert matched['edition']['status'] == 'modified'
    refreshed = mock_site.get(matched['edition']['key'])
    language_keys = [lang['key'] for lang in refreshed.languages]
    assert language_keys == ['/languages/eng', '/languages/fre']


def test_matched_edition_with_new_language_is_added_even_if_no_existing_language(
    mock_site, add_languages, ia_writeback
):
    """
    Ensure a new language is added even if the existing edition has no language
    field.
    """
    # First import carries no 'languages' key at all.
    rec = {
        'ocaid': 'test_item',
        'source_records': ['ia:test_item'],
        'title': 'Test item',
    }
    reply = load(rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'created'
    e = mock_site.get(reply['edition']['key'])
    assert e.type.key == '/type/edition'
    assert e.source_records == ['ia:test_item']

    # The 'languages' key appears exactly once: a repeated key in a dict
    # literal silently keeps only the last value.
    matching_rec = {
        'ocaid': 'test_item',
        'source_records': ['ia:test_item'],
        # Titles MUST match to be considered the same
        'title': 'Test item',
        'languages': ['eng'],
    }
    reply = load(matching_rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'modified'
    updated_edition = mock_site.get(reply['edition']['key'])
    updated_languages = [lang['key'] for lang in updated_edition.languages]
    assert updated_languages == ['/languages/eng']


def test_matched_edition_properly_updates_non_language_fields(
    mock_site, add_languages, ia_writeback
):
    """
    Ensure non-language list fields from a matching record are merged into the
    existing edition: new values are appended to a field the edition already
    has, and a field the edition lacks is added.
    """
    rec = {
        'ocaid': 'test_item',
        'source_records': ['ia:test_item'],
        'title': 'Test item',
    }
    reply = load(rec)
    assert reply['success'] is True
    # First import of this item, so the edition is created, not matched.
    assert reply['edition']['status'] == 'created'
    e = mock_site.get(reply['edition']['key'])
    assert e.type.key == '/type/edition'
    assert e.source_records == ['ia:test_item']

    matching_rec = {
        'ocaid': 'test_item',
        'source_records': ['test:1234567890'],  # updated existing field in edition.
        'title': 'Test item',
        'lc_classifications': ['PQ2671.A58'],  # new field not present in edition.
    }
    reply = load(matching_rec)
    assert reply['success'] is True
    assert reply['edition']['status'] == 'modified'
    updated_edition = mock_site.get(reply['edition']['key'])

    expected_source_records = ['ia:test_item', 'test:1234567890']
    expected_lc_classifications = ['PQ2671.A58']

    assert expected_source_records == updated_edition.source_records
    assert expected_lc_classifications == updated_edition.lc_classifications


class Test_From_MARC:
Expand Down
2 changes: 1 addition & 1 deletion openlibrary/catalog/add_book/tests/test_load_book.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@

from openlibrary.catalog.add_book import load_book
from openlibrary.catalog.add_book.load_book import (
InvalidLanguage,
build_query,
find_entity,
import_author,
remove_author_honorifics,
)
from openlibrary.catalog.utils import InvalidLanguage
from openlibrary.core.models import Author


Expand Down
28 changes: 28 additions & 0 deletions openlibrary/catalog/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import re
from collections.abc import Iterable
from typing import TYPE_CHECKING
from unicodedata import normalize

Expand Down Expand Up @@ -434,3 +435,30 @@ def get_missing_fields(rec: dict) -> list[str]:
'source_records',
]
return [field for field in required_fields if rec.get(field) is None]


class InvalidLanguage(Exception):
    """Signals a language code with no matching `/languages/<code>` record."""

    def __init__(self, code):
        # Keep the rejected code available to handlers.
        self.code = code

    def __str__(self):
        rejected = self.code
        return f"invalid language code: '{rejected}'"


def format_languages(languages: Iterable) -> list[dict[str, str]]:
    """
    Format language data to match Open Library's expected format.

    For an input of ["eng", "fre"], return:
    [{'key': '/languages/eng'}, {'key': '/languages/fre'}]

    Codes are lowercased before lookup. A single bare string is treated as
    one code. Empty or missing input yields an empty list.

    Raises InvalidLanguage if a code has no `/languages/<code>` record on
    `web.ctx.site`.
    """
    if not languages:
        return []

    # A bare str is itself iterable; without this it would be validated one
    # character at a time and rejected as an invalid language.
    if isinstance(languages, str):
        languages = [languages]

    formatted_languages = []
    for language in languages:
        # Normalise once and reuse for the lookup, the error and the result.
        code = language.lower()
        key = f"/languages/{code}"
        if web.ctx.site.get(key) is None:
            raise InvalidLanguage(code)

        formatted_languages.append({'key': key})

    return formatted_languages
2 changes: 1 addition & 1 deletion openlibrary/core/lists/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def get_export_list(self) -> dict[str, list[dict]]:
things = cast(
list[Thing],
web.ctx.site.get_many(
[seed.key for seed in self.seeds if isinstance(seed, Thing)]
[seed.key for seed in self.get_seeds() if seed._type != "subject"]
),
)

Expand Down
Loading

0 comments on commit caf898b

Please sign in to comment.