feat(api): make rottentomatoes matching more robust (#1265)

This commit is contained in:
Ben Haney
2025-01-31 09:04:34 -06:00
committed by GitHub
parent efaad21554
commit 907ba6fdea
3 changed files with 72 additions and 50 deletions

View File

@@ -42,6 +42,7 @@
"@supercharge/request-ip": "1.2.0",
"@svgr/webpack": "6.5.1",
"@tanem/react-nprogress": "5.0.30",
"@types/wink-jaro-distance": "^2.0.2",
"ace-builds": "1.15.2",
"bcrypt": "5.1.0",
"bowser": "2.11.0",
@@ -97,6 +98,7 @@
"typeorm": "0.3.11",
"undici": "^6.20.1",
"web-push": "3.5.0",
"wink-jaro-distance": "^2.0.0",
"winston": "3.8.2",
"winston-daily-rotate-file": "4.7.1",
"xml2js": "0.4.23",

16
pnpm-lock.yaml generated
View File

@@ -38,6 +38,9 @@ importers:
'@tanem/react-nprogress':
specifier: 5.0.30
version: 5.0.30(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
'@types/wink-jaro-distance':
specifier: ^2.0.2
version: 2.0.2
ace-builds:
specifier: 1.15.2
version: 1.15.2
@@ -203,6 +206,9 @@ importers:
web-push:
specifier: 3.5.0
version: 3.5.0
wink-jaro-distance:
specifier: ^2.0.0
version: 2.0.0
winston:
specifier: 3.8.2
version: 3.8.2
@@ -3250,6 +3256,9 @@ packages:
'@types/webxr@0.5.20':
resolution: {integrity: sha512-JGpU6qiIJQKUuVSKx1GtQnHJGxRjtfGIhzO2ilq43VZZS//f1h1Sgexbdk+Lq+7569a6EYhOWrUpIruR/1Enmg==}
'@types/wink-jaro-distance@2.0.2':
resolution: {integrity: sha512-Q79orp7qA/g/uLdFmqd5MtEa0ZfJW5X1WXikAu8IVHt24IrHWrcTNYNdPpLK5mwVg34C6FQnrv/DMtcUhjE/zA==}
'@types/xml2js@0.4.11':
resolution: {integrity: sha512-JdigeAKmCyoJUiQljjr7tQG3if9NkqGUgwEUqBvV0N7LM4HyQk7UXCnusRa1lnvXAEYJ8mw8GtZWioagNztOwA==}
@@ -9467,6 +9476,9 @@ packages:
wide-align@1.1.5:
resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==}
wink-jaro-distance@2.0.0:
resolution: {integrity: sha512-9bcUaXCi9N8iYpGWbFkf83OsBkg17r4hEyxusEzl+nnReLRPqxhB9YNeRn3g54SYnVRNXP029lY3HDsbdxTAuA==}
winston-daily-rotate-file@4.7.1:
resolution: {integrity: sha512-7LGPiYGBPNyGHLn9z33i96zx/bd71pjBn9tqQzO3I4Tayv94WPmBNwKC7CO1wPHdP9uvu+Md/1nr6VSH9h0iaA==}
engines: {node: '>=8'}
@@ -13737,6 +13749,8 @@ snapshots:
'@types/webxr@0.5.20': {}
'@types/wink-jaro-distance@2.0.2': {}
'@types/xml2js@0.4.11':
dependencies:
'@types/node': 22.10.5
@@ -20905,6 +20919,8 @@ snapshots:
dependencies:
string-width: 4.2.3
wink-jaro-distance@2.0.0: {}
winston-daily-rotate-file@4.7.1(winston@3.8.2):
dependencies:
file-stream-rotator: 0.6.1

View File

@@ -1,6 +1,7 @@
import ExternalAPI from '@server/api/externalapi';
import cacheManager from '@server/lib/cache';
import { getSettings } from '@server/lib/settings';
import jaro from 'wink-jaro-distance';
interface RTAlgoliaSearchResponse {
results: {
@@ -15,7 +16,7 @@ interface RTAlgoliaHit {
tmsId: string;
type: string;
title: string;
titles: string[];
titles?: string[];
description: string;
releaseYear: number;
rating: string;
@@ -24,9 +25,9 @@ interface RTAlgoliaHit {
isEmsSearchable: boolean;
rtId: number;
vanity: string;
aka: string[];
aka?: string[];
posterImageUrl: string;
rottenTomatoes: {
rottenTomatoes?: {
audienceScore: number;
criticsIconUrl: string;
wantToSeeCount: number;
@@ -47,6 +48,47 @@ export interface RTRating {
url: string;
}
// Tunables
const INEXACT_TITLE_FACTOR = 0.25;
const ALTERNATE_TITLE_FACTOR = 0.8;
const PER_YEAR_PENALTY = 0.4;
const MINIMUM_SCORE = 0.175;
// Normalization for title comparisons.
// Lowercase and strip non-alphanumeric (unicode-aware).
const norm = (s: string): string =>
s.toLowerCase().replace(/[^\p{L}\p{N} ]/gu, '');
// Title similarity. 1 if exact, quarter-jaro otherwise.
const similarity = (a: string, b: string): number =>
a === b ? 1 : jaro(a, b).similarity * INEXACT_TITLE_FACTOR;
// Gets the best similarity score between the searched title and all alternate
// titles of the search result. Non-main titles are penalized.
const t_score = ({ title, titles, aka }: RTAlgoliaHit, s: string): number => {
const f = (t: string, i: number) =>
similarity(norm(t), norm(s)) * (i ? ALTERNATE_TITLE_FACTOR : 1);
return Math.max(...[title].concat(aka || [], titles || []).map(f));
};
// Year difference to score: 0 -> 1.0, 1 -> 0.6, 2 -> 0.2, 3+ -> 0.0
const y_score = (r: RTAlgoliaHit, y?: number): number =>
y ? Math.max(0, 1 - Math.abs(r.releaseYear - y) * PER_YEAR_PENALTY) : 1;
// Cut score in half if result has no ratings.
const extra_score = (r: RTAlgoliaHit): number => (r.rottenTomatoes ? 1 : 0.5);
// Score search result as product of all subscores
const score = (r: RTAlgoliaHit, name: string, year?: number): number =>
t_score(r, name) * y_score(r, year) * extra_score(r);
// Score each search result and return the highest scoring result, if any
const best = (rs: RTAlgoliaHit[], name: string, year?: number): RTAlgoliaHit =>
rs
.map((r) => ({ score: score(r, name, year), result: r }))
.filter(({ score }) => score > MINIMUM_SCORE)
.sort(({ score: a }, { score: b }) => b - a)[0]?.result;
/**
* This is a best-effort API. The Rotten Tomatoes API is technically
* private and getting access costs money/requires approval.
@@ -90,47 +132,21 @@ class RottenTomatoes extends ExternalAPI {
year: number
): Promise<RTRating | null> {
try {
const filters = encodeURIComponent('isEmsSearchable=1 AND type:"movie"');
const data = await this.post<RTAlgoliaSearchResponse>('/queries', {
requests: [
{
indexName: 'content_rt',
query: name,
params: 'filters=isEmsSearchable%20%3D%201&hitsPerPage=20',
query: name.replace(/\bthe\b ?/gi, ''),
params: `filters=${filters}&hitsPerPage=20`,
},
],
});
const contentResults = data.results.find((r) => r.index === 'content_rt');
const movie = best(contentResults?.hits || [], name, year);
if (!contentResults) {
return null;
}
// First, attempt to match exact name and year
let movie = contentResults.hits.find(
(movie) => movie.releaseYear === year && movie.title === name
);
// If we don't find a movie, try to match partial name and year
if (!movie) {
movie = contentResults.hits.find(
(movie) => movie.releaseYear === year && movie.title.includes(name)
);
}
// If we still dont find a movie, try to match just on year
if (!movie) {
movie = contentResults.hits.find((movie) => movie.releaseYear === year);
}
// One last try, try exact name match only
if (!movie) {
movie = contentResults.hits.find((movie) => movie.title === name);
}
if (!movie?.rottenTomatoes) {
return null;
}
if (!movie?.rottenTomatoes) return null;
return {
title: movie.title,
@@ -158,33 +174,21 @@ class RottenTomatoes extends ExternalAPI {
year?: number
): Promise<RTRating | null> {
try {
const filters = encodeURIComponent('isEmsSearchable=1 AND type:"tv"');
const data = await this.post<RTAlgoliaSearchResponse>('/queries', {
requests: [
{
indexName: 'content_rt',
query: name,
params: 'filters=isEmsSearchable%20%3D%201&hitsPerPage=20',
params: `filters=${filters}&hitsPerPage=20`,
},
],
});
const contentResults = data.results.find((r) => r.index === 'content_rt');
const tvshow = best(contentResults?.hits || [], name, year);
if (!contentResults) {
return null;
}
let tvshow: RTAlgoliaHit | undefined = contentResults.hits[0];
if (year) {
tvshow = contentResults.hits.find(
(series) => series.releaseYear === year
);
}
if (!tvshow || !tvshow.rottenTomatoes) {
return null;
}
if (!tvshow?.rottenTomatoes) return null;
return {
title: tvshow.title,