Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] 10390/fix/sanitize single double quote in solr query #10405

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions openlibrary/plugins/worksearch/schemes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
escape_unknown_fields,
fully_escape_query,
luqum_parser,
sanitize_solr_symbols,
)

logger = logging.getLogger("openlibrary.worksearch")
Expand Down Expand Up @@ -82,6 +83,7 @@ def process_user_query(self, q_param: str) -> str:
return q_param

try:
q_param = sanitize_solr_symbols(q_param, ['"'])
q_param = escape_unknown_fields(
(
# Solr 4+ has support for regexes (eg `key:/foo.*/`)! But for now,
Expand All @@ -104,6 +106,7 @@ def process_user_query(self, q_param: str) -> str:
q_tree = luqum_parser(fully_escape_query(q_param))

q_tree = self.transform_user_query(q_param, q_tree)

return str(q_tree)

def transform_user_query(
Expand Down
8 changes: 8 additions & 0 deletions openlibrary/plugins/worksearch/schemes/tests/test_works.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@
'title:"food rules" author:pollan',
'alternative_title:"food rules" author_name:pollan',
),
'Unmatched double-quote': (
'title:Compilation Group for the "History of Modern China',
'alternative_title:(Compilation Group for the) "History of Modern China"',
),
'Unmatched double-quotes': (
'title:"Compilation Group for the "History of Modern China"',
'alternative_title:"Compilation Group for the "History of Modern China""',
),
'Leading text': (
'query here title:food rules author:pollan',
'query here alternative_title:(food rules) author_name:pollan',
Expand Down
9 changes: 9 additions & 0 deletions openlibrary/solr/query_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,15 @@ def fully_escape_query(query: str) -> str:
return escaped


def _count_unescaped(text: str, symbol: str) -> int:
    """Count occurrences of the single character `symbol` not preceded by an escaping backslash."""
    count = 0
    escaped = False
    for ch in text:
        if escaped:
            # This character is consumed by the preceding backslash.
            escaped = False
        elif ch == '\\':
            escaped = True
        elif ch == symbol:
            count += 1
    return count


def sanitize_solr_symbols(query: str, chars: list[str]) -> str:
    """
    Ensure paired Solr special symbols are balanced in the query.

    For each requested symbol that is known to come in pairs (currently only
    the double quote), a closing symbol is appended to the end of the query
    when the query contains an odd number of them. Symbols in `chars` that
    are not paired symbols are ignored.

    Backslash-escaped symbols are literal characters in Lucene/Solr query
    syntax, so they are not counted when deciding whether a symbol is
    unclosed.

    NOTE: a query ending in a lone backslash is not specially handled; the
    appended symbol would directly follow that backslash.

    :param query: The raw user query.
    :param chars: Symbols to sanitize, e.g. `['"']`.
    :return: The query, with a closing symbol appended where one was missing.

    >>> sanitize_solr_symbols('title:"food rules', ['"'])
    'title:"food rules"'
    >>> sanitize_solr_symbols('title:"food rules"', ['"'])
    'title:"food rules"'
    """
    for c in chars:
        # Exact comparison: `c in ('"')` was a substring test against a plain
        # string, which also matched the empty string.
        if c == '"' and _count_unescaped(query, c) % 2 == 1:
            query += c
    return query


def luqum_parser(query: str) -> Item:
"""
Parses a lucene-like query, with the special binding rules of Open Library.
Expand Down