Merge branch 'internetarchive:master' into 10196/refactor/enable-ruff…

…-rule-SIM115
internetarchive · Jan 18, 2025 · caf898b · caf898b
2 parents 6163276 + 3e18ea8
commit caf898b
Show file tree

Hide file tree

Showing 19 changed files with 5,568 additions and 2,519 deletions.
diff --git a/docker/Dockerfile.olbase b/docker/Dockerfile.olbase
@@ -38,22 +38,10 @@ RUN apt-get -qq update && apt-get install -y \
 COPY scripts/install_nodejs.sh ./
 RUN ./install_nodejs.sh && rm ./install_nodejs.sh
 
-# Install Archive.org nginx w/ IP anonymization
+# Install nginx
 USER root
-RUN apt-get update && apt-get install -y --no-install-recommends nginx curl letsencrypt \
-    # nginx-plus
-    apt-transport-https lsb-release ca-certificates wget \
-    # log rotation service for ol-nginx
-    logrotate \
-    # rsync service for pulling monthly sitemaps from ol-home0 to ol-www0
-    rsync
-COPY scripts/install_openresty.sh ./
-RUN ./install_openresty.sh && rm ./install_openresty.sh
-RUN rm /usr/sbin/nginx
-RUN curl -L https://archive.org/download/nginx/nginx -o /usr/sbin/nginx
-RUN chmod +x /usr/sbin/nginx
-# Remove the stock nginx config file
-RUN rm /etc/nginx/sites-enabled/default
+COPY scripts/install_nginx.sh ./
+RUN ./install_nginx.sh && rm ./install_nginx.sh
 
 RUN mkdir -p /var/log/openlibrary /var/lib/openlibrary && chown openlibrary:openlibrary /var/log/openlibrary /var/lib/openlibrary \
  && mkdir /openlibrary && chown openlibrary:openlibrary /openlibrary \

diff --git a/docker/covers_nginx.conf b/docker/covers_nginx.conf
@@ -14,6 +14,9 @@ server {
     ssl_protocols TLSv1.2 TLSv1.3;
     ssl_ciphers EECDH+CHACHA20:EECDH+AES128:RSA+AES128:EECDH+AES256:RSA+AES256:EECDH+3DES:RSA+3DES:!MD5;
     ssl_prefer_server_ciphers on;
+
+    # Needed for logging/IP anonymization
+    include /olsystem/etc/nginx/logging_periodics.conf;
 }
 
 # Docker's internal load balancing ends up with unbalanced connections eventually.

diff --git a/docker/nginx.conf b/docker/nginx.conf
@@ -1,3 +1,6 @@
+# Needed for IP anonymization
+load_module modules/ngx_http_js_module.so;
+
 user  www-data;
 
 # XXX-Anand: Oct 2013
@@ -25,7 +28,8 @@ http {
     server_names_hash_bucket_size   64;
     types_hash_bucket_size 64;
 
-    log_format iacombined '$remote_addr_ipscrub $host $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $request_time';
+    # Logging / IP Anonymization; also need logging_periodics.conf inside a server block
+    include /olsystem/etc/nginx/logging.conf;
     access_log    /var/log/nginx/access.log iacombined;
 
     client_max_body_size 50m;

diff --git a/docker/web_nginx.conf b/docker/web_nginx.conf
@@ -32,6 +32,9 @@ server {
     ssl_protocols TLSv1.2 TLSv1.3;
     ssl_ciphers EECDH+CHACHA20:EECDH+AES128:RSA+AES128:EECDH+AES256:RSA+AES256:EECDH+3DES:RSA+3DES:!MD5;
     ssl_prefer_server_ciphers on;
+
+    # Needed for logging/IP anonymization
+    include /olsystem/etc/nginx/logging_periodics.conf;
 }
 
 server {

diff --git a/openlibrary/catalog/add_book/__init__.py b/openlibrary/catalog/add_book/__init__.py
@@ -36,14 +36,15 @@
 from infogami import config
 from openlibrary import accounts
 from openlibrary.catalog.add_book.load_book import (
-    InvalidLanguage,
     build_query,
     east_in_by_statement,
     import_author,
 )
 from openlibrary.catalog.add_book.match import editions_match, mk_norm
 from openlibrary.catalog.utils import (
     EARLIEST_PUBLISH_YEAR_FOR_BOOKSELLERS,
+    InvalidLanguage,
+    format_languages,
     get_non_isbn_asin,
     get_publication_year,
     is_independently_published,
@@ -818,20 +819,30 @@ def update_edition_with_rec_data(
         'lc_classifications',
         'oclc_numbers',
         'source_records',
+        'languages',
     ]
-    for f in edition_list_fields:
-        if f not in rec or not rec[f]:
+    edition_dict: dict = edition.dict()
+    for field in edition_list_fields:
+        if field not in rec:
             continue
-        # ensure values is a list
-        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
-        if f in edition:
-            # get values from rec field that are not currently on the edition
-            case_folded_values = {v.casefold() for v in edition[f]}
-            to_add = [v for v in values if v.casefold() not in case_folded_values]
-            edition[f] += to_add
+
+        existing_values = edition_dict.get(field, []) or []
+        rec_values = rec.get(field, [])
+
+        # Languages in `rec` are ['eng'], etc., but import requires dict-style.
+        if field == 'languages':
+            formatted_languages = format_languages(languages=rec_values)
+            supplemented_values = existing_values + [
+                lang for lang in formatted_languages if lang not in existing_values
+            ]
         else:
-            edition[f] = to_add = values
-        if to_add:
+            case_folded_values = [v.casefold() for v in existing_values]
+            supplemented_values = existing_values + [
+                v for v in rec_values if v.casefold() not in case_folded_values
+            ]
+
+        if existing_values != supplemented_values:
+            edition[field] = supplemented_values
             need_edition_save = True
 
     # Fields that are added as a whole if absent. (Individual values are not added.)
@@ -848,7 +859,7 @@ def update_edition_with_rec_data(
             edition[f] = rec[f]
             need_edition_save = True
 
-    # Add new identifiers
+    # Add new identifiers (dict values, so different treatment from lists above.)
     if 'identifiers' in rec:
         identifiers = defaultdict(list, edition.dict().get('identifiers', {}))
         for k, vals in rec['identifiers'].items():

diff --git a/openlibrary/catalog/add_book/load_book.py b/openlibrary/catalog/add_book/load_book.py
@@ -2,7 +2,12 @@
 
 import web
 
-from openlibrary.catalog.utils import author_dates_match, flip_name, key_int
+from openlibrary.catalog.utils import (
+    author_dates_match,
+    flip_name,
+    format_languages,
+    key_int,
+)
 from openlibrary.core.helpers import extract_year
 
 if TYPE_CHECKING:
@@ -254,14 +259,6 @@ def import_author(author: dict[str, Any], eastern=False) -> "Author | dict[str,
     return a
 
 
-class InvalidLanguage(Exception):
-    def __init__(self, code):
-        self.code = code
-
-    def __str__(self):
-        return f"invalid language code: '{self.code}'"
-
-
 type_map = {'description': 'text', 'notes': 'text', 'number_of_pages': 'int'}
 
 
@@ -283,12 +280,12 @@ def build_query(rec: dict[str, Any]) -> dict[str, Any]:
                     east = east_in_by_statement(rec, author)
                     book['authors'].append(import_author(author, eastern=east))
             continue
+
         if k in ('languages', 'translated_from'):
-            for language in v:
-                if web.ctx.site.get('/languages/' + language.lower()) is None:
-                    raise InvalidLanguage(language.lower())
-            book[k] = [{'key': '/languages/' + language.lower()} for language in v]
+            formatted_languages = format_languages(languages=v)
+            book[k] = formatted_languages
             continue
+
         if k in type_map:
             t = '/type/' + type_map[k]
             if isinstance(v, list):

diff --git a/openlibrary/catalog/add_book/tests/test_add_book.py b/openlibrary/catalog/add_book/tests/test_add_book.py
@@ -299,6 +299,10 @@ def test_load_with_redirected_author(mock_site, add_languages):
 
 
 def test_duplicate_ia_book(mock_site, add_languages, ia_writeback):
+    """
+    Here all fields that are 'used' (i.e. read and contribute to the edition)
+    are the same.
+    """
     rec = {
         'ocaid': 'test_item',
         'source_records': ['ia:test_item'],
@@ -312,16 +316,123 @@ def test_duplicate_ia_book(mock_site, add_languages, ia_writeback):
     assert e.type.key == '/type/edition'
     assert e.source_records == ['ia:test_item']
 
+    matching_rec = {
+        'ocaid': 'test_item',
+        'source_records': ['ia:test_item'],
+        # Titles MUST match to be considered the same
+        'title': 'Test item',
+        'languages': ['eng'],
+    }
+    reply = load(matching_rec)
+    assert reply['success'] is True
+    assert reply['edition']['status'] == 'matched'
+
+
+def test_matched_edition_with_new_language_in_rec_adds_language(
+    mock_site, add_languages, ia_writeback
+):
+    """
+    When records match, but the record has a new language, the new language
+    should be added to the existing edition, but existing languages should
+    not be duplicated.
+    """
+    rec = {
+        'ocaid': 'test_item',
+        'source_records': ['ia:test_item'],
+        'title': 'Test item',
+        'languages': ['eng'],
+    }
+    reply = load(rec)
+    assert reply['success'] is True
+    assert reply['edition']['status'] == 'created'
+    e = mock_site.get(reply['edition']['key'])
+    assert e.type.key == '/type/edition'
+    assert e.source_records == ['ia:test_item']
+
+    matching_rec = {
+        'ocaid': 'test_item',
+        'source_records': ['ia:test_item'],
+        # Titles MUST match to be considered the same
+        'title': 'Test item',
+        'languages': ['fre', 'eng'],
+    }
+    reply = load(matching_rec)
+    assert reply['success'] is True
+    assert reply['edition']['status'] == 'modified'
+    updated_e = mock_site.get(reply['edition']['key'])
+    updated_languages = [lang['key'] for lang in updated_e.languages]
+    assert updated_languages == ['/languages/eng', '/languages/fre']
+
+
+def test_matched_edition_with_new_language_is_added_even_if_no_existing_language(
+    mock_site, add_languages, ia_writeback
+):
+    """
+    Ensure a new language is added even if the existing edition has no language
+    field.
+    """
     rec = {
+        'ocaid': 'test_item',
+        'source_records': ['ia:test_item'],
+        'title': 'Test item',
+    }
+    reply = load(rec)
+    assert reply['success'] is True
+    assert reply['edition']['status'] == 'created'
+    e = mock_site.get(reply['edition']['key'])
+    assert e.type.key == '/type/edition'
+    assert e.source_records == ['ia:test_item']
+
+    matching_rec = {
         'ocaid': 'test_item',
         'source_records': ['ia:test_item'],
         # Titles MUST match to be considered the same
         'title': 'Test item',
-        'languages': ['fre'],
+        'languages': ['eng'],
+    }
+    reply = load(matching_rec)
+    assert reply['success'] is True
+    assert reply['edition']['status'] == 'modified'
+    updated_edition = mock_site.get(reply['edition']['key'])
+    updated_languages = [lang['key'] for lang in updated_edition.languages]
+    assert updated_languages == ['/languages/eng']
+
+
+def test_matched_edition_properly_updates_non_language_fields(
+    mock_site, add_languages, ia_writeback
+):
+    """
+    Ensure non-language list fields are updated on a match: new values are
+    appended to an existing field, and a field absent from the edition is added.
+    """
+    rec = {
+        'ocaid': 'test_item',
+        'source_records': ['ia:test_item'],
+        'title': 'Test item',
     }
     reply = load(rec)
     assert reply['success'] is True
-    assert reply['edition']['status'] == 'matched'
+    assert reply['edition']['status'] == 'created'
+    e = mock_site.get(reply['edition']['key'])
+    assert e.type.key == '/type/edition'
+    assert e.source_records == ['ia:test_item']
+
+    matching_rec = {
+        'ocaid': 'test_item',
+        'source_records': ['test:1234567890'],  # updated existing field in edition.
+        'title': 'Test item',
+        'lc_classifications': ['PQ2671.A58'],  # new field not present in edition.
+    }
+    reply = load(matching_rec)
+    assert reply['success'] is True
+    assert reply['edition']['status'] == 'modified'
+    updated_edition = mock_site.get(reply['edition']['key'])
+
+    expected_source_records = ['ia:test_item', 'test:1234567890']
+    expected_lc_classifications = ['PQ2671.A58']
+
+    assert expected_source_records == updated_edition.source_records
+    assert expected_lc_classifications == updated_edition.lc_classifications
 
 
 class Test_From_MARC:

diff --git a/openlibrary/catalog/add_book/tests/test_load_book.py b/openlibrary/catalog/add_book/tests/test_load_book.py
@@ -2,12 +2,12 @@
 
 from openlibrary.catalog.add_book import load_book
 from openlibrary.catalog.add_book.load_book import (
-    InvalidLanguage,
     build_query,
     find_entity,
     import_author,
     remove_author_honorifics,
 )
+from openlibrary.catalog.utils import InvalidLanguage
 from openlibrary.core.models import Author
 
 

diff --git a/openlibrary/catalog/utils/__init__.py b/openlibrary/catalog/utils/__init__.py
@@ -1,5 +1,6 @@
 import datetime
 import re
+from collections.abc import Iterable
 from typing import TYPE_CHECKING
 from unicodedata import normalize
 
@@ -434,3 +435,30 @@ def get_missing_fields(rec: dict) -> list[str]:
         'source_records',
     ]
     return [field for field in required_fields if rec.get(field) is None]
+
+
+class InvalidLanguage(Exception):
+    def __init__(self, code):
+        self.code = code
+
+    def __str__(self):
+        return f"invalid language code: '{self.code}'"
+
+
+def format_languages(languages: Iterable) -> list[dict[str, str]]:
+    """
+    Format language codes as Open Library language references, lowercasing each:
+    ["eng", "fre"] -> [{'key': '/languages/eng'}, {'key': '/languages/fre'}].
+    Returns [] for empty input; raises InvalidLanguage if a code is not on the site.
+    """
+    if not languages:
+        return []
+
+    formatted_languages = []
+    for language in languages:
+        if web.ctx.site.get(f"/languages/{language.lower()}") is None:
+            raise InvalidLanguage(language.lower())
+
+        formatted_languages.append({'key': f'/languages/{language.lower()}'})
+
+    return formatted_languages
diff --git a/openlibrary/core/lists/model.py b/openlibrary/core/lists/model.py
@@ -198,7 +198,7 @@ def get_export_list(self) -> dict[str, list[dict]]:
         things = cast(
             list[Thing],
             web.ctx.site.get_many(
-                [seed.key for seed in self.seeds if isinstance(seed, Thing)]
+                [seed.key for seed in self.get_seeds() if seed._type != "subject"]
             ),
         )