From 907ba6fdea0341e8d0f429eaf6aaa404dbc7daff Mon Sep 17 00:00:00 2001 From: Ben Haney <31331498+benhaney@users.noreply.github.com> Date: Fri, 31 Jan 2025 09:04:34 -0600 Subject: [PATCH] feat(api): make rottentomatoes matching more robust (#1265) --- package.json | 2 + pnpm-lock.yaml | 16 +++++ server/api/rating/rottentomatoes.ts | 104 +++++++++++++++------------- 3 files changed, 72 insertions(+), 50 deletions(-) diff --git a/package.json b/package.json index cba4d61cf..6e6500ede 100644 --- a/package.json +++ b/package.json @@ -42,6 +42,7 @@ "@supercharge/request-ip": "1.2.0", "@svgr/webpack": "6.5.1", "@tanem/react-nprogress": "5.0.30", + "@types/wink-jaro-distance": "^2.0.2", "ace-builds": "1.15.2", "bcrypt": "5.1.0", "bowser": "2.11.0", @@ -97,6 +98,7 @@ "typeorm": "0.3.11", "undici": "^6.20.1", "web-push": "3.5.0", + "wink-jaro-distance": "^2.0.0", "winston": "3.8.2", "winston-daily-rotate-file": "4.7.1", "xml2js": "0.4.23", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f99449432..de1247df5 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -38,6 +38,9 @@ importers: '@tanem/react-nprogress': specifier: 5.0.30 version: 5.0.30(react-dom@18.3.1(react@18.3.1))(react@18.3.1) + '@types/wink-jaro-distance': + specifier: ^2.0.2 + version: 2.0.2 ace-builds: specifier: 1.15.2 version: 1.15.2 @@ -203,6 +206,9 @@ importers: web-push: specifier: 3.5.0 version: 3.5.0 + wink-jaro-distance: + specifier: ^2.0.0 + version: 2.0.0 winston: specifier: 3.8.2 version: 3.8.2 @@ -3250,6 +3256,9 @@ packages: '@types/webxr@0.5.20': resolution: {integrity: sha512-JGpU6qiIJQKUuVSKx1GtQnHJGxRjtfGIhzO2ilq43VZZS//f1h1Sgexbdk+Lq+7569a6EYhOWrUpIruR/1Enmg==} + '@types/wink-jaro-distance@2.0.2': + resolution: {integrity: sha512-Q79orp7qA/g/uLdFmqd5MtEa0ZfJW5X1WXikAu8IVHt24IrHWrcTNYNdPpLK5mwVg34C6FQnrv/DMtcUhjE/zA==} + '@types/xml2js@0.4.11': resolution: {integrity: sha512-JdigeAKmCyoJUiQljjr7tQG3if9NkqGUgwEUqBvV0N7LM4HyQk7UXCnusRa1lnvXAEYJ8mw8GtZWioagNztOwA==} @@ -9467,6 +9476,9 @@ packages: wide-align@1.1.5: resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==} + wink-jaro-distance@2.0.0: + resolution: {integrity: sha512-9bcUaXCi9N8iYpGWbFkf83OsBkg17r4hEyxusEzl+nnReLRPqxhB9YNeRn3g54SYnVRNXP029lY3HDsbdxTAuA==} + winston-daily-rotate-file@4.7.1: resolution: {integrity: sha512-7LGPiYGBPNyGHLn9z33i96zx/bd71pjBn9tqQzO3I4Tayv94WPmBNwKC7CO1wPHdP9uvu+Md/1nr6VSH9h0iaA==} engines: {node: '>=8'} @@ -13737,6 +13749,8 @@ snapshots: '@types/webxr@0.5.20': {} + '@types/wink-jaro-distance@2.0.2': {} + '@types/xml2js@0.4.11': dependencies: '@types/node': 22.10.5 @@ -20905,6 +20919,8 @@ snapshots: dependencies: string-width: 4.2.3 + wink-jaro-distance@2.0.0: {} + winston-daily-rotate-file@4.7.1(winston@3.8.2): dependencies: file-stream-rotator: 0.6.1 diff --git a/server/api/rating/rottentomatoes.ts b/server/api/rating/rottentomatoes.ts index 170cbb64f..bfded7671 100644 --- a/server/api/rating/rottentomatoes.ts +++ b/server/api/rating/rottentomatoes.ts @@ -1,6 +1,7 @@ import ExternalAPI from '@server/api/externalapi'; import cacheManager from '@server/lib/cache'; import { getSettings } from '@server/lib/settings'; +import jaro from 'wink-jaro-distance'; interface RTAlgoliaSearchResponse { results: { @@ -15,7 +16,7 @@ interface RTAlgoliaHit { tmsId: string; type: string; title: string; - titles: string[]; + titles?: string[]; description: string; releaseYear: number; rating: string; @@ -24,9 +25,9 @@ interface RTAlgoliaHit { isEmsSearchable: boolean; rtId: number; vanity: string; - aka: string[]; + aka?: string[]; posterImageUrl: string; - rottenTomatoes: { + rottenTomatoes?: { audienceScore: number; criticsIconUrl: string; wantToSeeCount: number; @@ -47,6 +48,47 @@ export interface RTRating { url: string; } +// Tunables +const INEXACT_TITLE_FACTOR = 0.25; +const ALTERNATE_TITLE_FACTOR = 0.8; +const PER_YEAR_PENALTY = 0.4; +const MINIMUM_SCORE = 0.175; + +// Normalization for title comparisons. +// Lowercase and strip non-alphanumeric (unicode-aware). +const norm = (s: string): string => + s.toLowerCase().replace(/[^\p{L}\p{N} ]/gu, ''); + +// Title similarity. 1 if exact, quarter-jaro otherwise. +const similarity = (a: string, b: string): number => + a === b ? 1 : jaro(a, b).similarity * INEXACT_TITLE_FACTOR; + +// Gets the best similarity score between the searched title and all alternate +// titles of the search result. Non-main titles are penalized. +const t_score = ({ title, titles, aka }: RTAlgoliaHit, s: string): number => { + const f = (t: string, i: number) => + similarity(norm(t), norm(s)) * (i ? ALTERNATE_TITLE_FACTOR : 1); + return Math.max(...[title].concat(aka || [], titles || []).map(f)); +}; + +// Year difference to score: 0 -> 1.0, 1 -> 0.6, 2 -> 0.2, 3+ -> 0.0 +const y_score = (r: RTAlgoliaHit, y?: number): number => + y ? Math.max(0, 1 - Math.abs(r.releaseYear - y) * PER_YEAR_PENALTY) : 1; + +// Cut score in half if result has no ratings. +const extra_score = (r: RTAlgoliaHit): number => (r.rottenTomatoes ? 1 : 0.5); + +// Score search result as product of all subscores +const score = (r: RTAlgoliaHit, name: string, year?: number): number => + t_score(r, name) * y_score(r, year) * extra_score(r); + +// Score each search result and return the highest scoring result, if any +const best = (rs: RTAlgoliaHit[], name: string, year?: number): RTAlgoliaHit => + rs + .map((r) => ({ score: score(r, name, year), result: r })) + .filter(({ score }) => score > MINIMUM_SCORE) + .sort(({ score: a }, { score: b }) => b - a)[0]?.result; + /** * This is a best-effort API. The Rotten Tomatoes API is technically * private and getting access costs money/requires approval. @@ -90,47 +132,21 @@ class RottenTomatoes extends ExternalAPI { year: number ): Promise { try { + const filters = encodeURIComponent('isEmsSearchable=1 AND type:"movie"'); const data = await this.post('/queries', { requests: [ { indexName: 'content_rt', - query: name, - params: 'filters=isEmsSearchable%20%3D%201&hitsPerPage=20', + query: name.replace(/\bthe\b ?/gi, ''), + params: `filters=${filters}&hitsPerPage=20`, }, ], }); const contentResults = data.results.find((r) => r.index === 'content_rt'); + const movie = best(contentResults?.hits || [], name, year); - if (!contentResults) { - return null; - } - - // First, attempt to match exact name and year - let movie = contentResults.hits.find( - (movie) => movie.releaseYear === year && movie.title === name - ); - - // If we don't find a movie, try to match partial name and year - if (!movie) { - movie = contentResults.hits.find( - (movie) => movie.releaseYear === year && movie.title.includes(name) - ); - } - - // If we still dont find a movie, try to match just on year - if (!movie) { - movie = contentResults.hits.find((movie) => movie.releaseYear === year); - } - - // One last try, try exact name match only - if (!movie) { - movie = contentResults.hits.find((movie) => movie.title === name); - } - - if (!movie?.rottenTomatoes) { - return null; - } + if (!movie?.rottenTomatoes) return null; return { title: movie.title, @@ -158,33 +174,21 @@ class RottenTomatoes extends ExternalAPI { year?: number ): Promise { try { + const filters = encodeURIComponent('isEmsSearchable=1 AND type:"tv"'); const data = await this.post('/queries', { requests: [ { indexName: 'content_rt', query: name, - params: 'filters=isEmsSearchable%20%3D%201&hitsPerPage=20', + params: `filters=${filters}&hitsPerPage=20`, }, ], }); const contentResults = data.results.find((r) => r.index === 'content_rt'); + const tvshow = best(contentResults?.hits || [], name, year); - if (!contentResults) { - return null; - } - - let tvshow: RTAlgoliaHit | undefined = contentResults.hits[0]; - - if (year) { - tvshow = contentResults.hits.find( - (series) => series.releaseYear === year - ); - } - - if (!tvshow || !tvshow.rottenTomatoes) { - return null; - } + if (!tvshow?.rottenTomatoes) return null; return { title: tvshow.title,