Files
seanime-docker/seanime-2.9.10/internal/library/scanner/matcher.go
2025-09-20 14:08:38 +01:00

553 lines
17 KiB
Go

package scanner
import (
"errors"
"fmt"
"math"
"seanime/internal/api/anilist"
"seanime/internal/hook"
"seanime/internal/library/anime"
"seanime/internal/library/summary"
"seanime/internal/util"
"seanime/internal/util/comparison"
"time"
"github.com/adrg/strutil/metrics"
"github.com/rs/zerolog"
"github.com/samber/lo"
lop "github.com/samber/lo/parallel"
"github.com/sourcegraph/conc/pool"
)
// Matcher assigns a media ID to local files by comparing each file's title
// variations against the titles held by the MediaContainer.
type Matcher struct {
// LocalFiles are the files to match. Matching writes the chosen media ID to
// each file's MediaId; files that cannot be matched keep MediaId 0.
LocalFiles []*anime.LocalFile
// MediaContainer supplies the candidate titles and resolves a winning title
// (or a media ID) back to a media entry.
MediaContainer *MediaContainer
// CompleteAnimeCache is not referenced by the matching code in this section.
CompleteAnimeCache *anilist.CompleteAnimeCache
// Logger receives the matcher's debug messages. It is used without a nil check.
Logger *zerolog.Logger
// ScanLogger is optional: every use in this section is guarded by a nil check.
ScanLogger *ScanLogger
// NOTE(review): ScanSummaryLogger is called without nil checks in this section,
// so its methods presumably accept a nil receiver - confirm.
ScanSummaryLogger *summary.ScanSummaryLogger // optional
// Algorithm selects the comparison: "jaccard" or "sorensen-dice"; any other
// value (including the empty string) falls back to Levenshtein.
Algorithm string
// Threshold is the minimum final rating a best match needs for the file to be
// matched. A zero value is replaced by 0.5 in MatchLocalFilesWithMedia.
Threshold float64
}
var (
ErrNoLocalFiles = errors.New("[matcher] no local files")
)
// MatchLocalFilesWithMedia tries to match every anime.LocalFile held by the
// matcher with a media entry and stores the result in the file's MediaId.
//
// It returns ErrNoLocalFiles when there are no local files, an error when the
// media container holds no media, and nil otherwise - including when the
// ScanMatchingStarted hook prevents the default matching.
func (m *Matcher) MatchLocalFilesWithMedia() error {
	// Fall back to the default threshold when none was configured.
	if m.Threshold == 0 {
		m.Threshold = 0.5
	}

	start := time.Now()

	// Bail out early when there is nothing to match or nothing to match against.
	switch {
	case len(m.LocalFiles) == 0:
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.WarnLevel).Msg("No local files")
		}
		return ErrNoLocalFiles
	case len(m.MediaContainer.allMedia) == 0:
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.WarnLevel).Msg("No media fed into the matcher")
		}
		return errors.New("[matcher] no media fed into the matcher")
	}

	m.Logger.Debug().Msg("matcher: Starting matching process")

	// Let hooks inspect or replace the matcher's inputs before matching starts.
	// The trigger's error is deliberately ignored.
	startedEvent := &ScanMatchingStartedEvent{
		LocalFiles:      m.LocalFiles,
		NormalizedMedia: m.MediaContainer.NormalizedMedia,
		Algorithm:       m.Algorithm,
		Threshold:       m.Threshold,
	}
	_ = hook.GlobalHookManager.OnScanMatchingStarted().Trigger(startedEvent)
	m.LocalFiles = startedEvent.LocalFiles
	m.MediaContainer.NormalizedMedia = startedEvent.NormalizedMedia
	m.Algorithm = startedEvent.Algorithm
	m.Threshold = startedEvent.Threshold

	if startedEvent.DefaultPrevented {
		m.Logger.Debug().Msg("matcher: Match stopped by hook")
		return nil
	}

	// Match all files in parallel.
	lop.ForEach(m.LocalFiles, func(file *anime.LocalFile, _ int) {
		m.matchLocalFileWithMedia(file)
	})

	// The validation pass (m.validateMatches) is currently disabled.

	// Let hooks inspect or replace the matched files.
	// The trigger's error is deliberately ignored.
	doneEvent := &ScanMatchingCompletedEvent{
		LocalFiles: m.LocalFiles,
	}
	_ = hook.GlobalHookManager.OnScanMatchingCompleted().Trigger(doneEvent)
	m.LocalFiles = doneEvent.LocalFiles

	if m.ScanLogger == nil {
		return nil
	}

	elapsedMs := time.Since(start).Milliseconds()
	unmatched := 0
	for _, file := range m.LocalFiles {
		if file.MediaId == 0 {
			unmatched++
		}
	}
	m.ScanLogger.LogMatcher(zerolog.InfoLevel).
		Int64("ms", elapsedMs).
		Int("files", len(m.LocalFiles)).
		Int("unmatched", unmatched).
		Msg("Finished matching process")

	return nil
}
// matchLocalFileWithMedia finds the best media match for a single local file.
//
// Every title variation of the file is compared against the media container's
// engTitles, romTitles and synonyms using m.Algorithm ("jaccard",
// "sorensen-dice", anything else selects Levenshtein). The best result over all
// variations is resolved to a media entry and passed to the
// ScanLocalFileMatched hook, which may override it.
//
//   - If the final rating reaches m.Threshold, lf.MediaId is set to the match's ID.
//   - If the rating is too low, or no candidate/media is found, lf.MediaId stays 0.
//   - If the function panics, the deferred handler resets lf.MediaId to 0.
//
// Files that already have a MediaId, have no parsed title, or have no title
// variations are skipped.
func (m *Matcher) matchLocalFileWithMedia(lf *anime.LocalFile) {
	defer util.HandlePanicInModuleThenS("scanner/matcher/matchLocalFileWithMedia", func(stackTrace string) {
		lf.MediaId = 0
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.ErrorLevel).
				Str("filename", lf.Name).
				Msg("Panic occurred, file un-matched")
		}
		m.ScanSummaryLogger.LogPanic(lf, stackTrace)
	})

	// Skip files that were matched before this run.
	if lf.MediaId != 0 {
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.DebugLevel).
				Str("filename", lf.Name).
				Msg("File already matched")
		}
		m.ScanSummaryLogger.LogFileNotMatched(lf, "Already matched")
		return
	}

	// Without a parsed title there is nothing to compare.
	if lf.GetParsedTitle() == "" {
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.WarnLevel).
				Str("filename", lf.Name).
				Msg("File has no parsed title")
		}
		m.ScanSummaryLogger.LogFileNotMatched(lf, "No parsed title found")
		return
	}

	titleVariations := lf.GetTitleVariations()
	if len(titleVariations) == 0 {
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.WarnLevel).
				Str("filename", lf.Name).
				Msg("No titles found")
		}
		m.ScanSummaryLogger.LogFileNotMatched(lf, "No title variations found")
		return
	}

	if m.ScanLogger != nil {
		m.ScanLogger.LogMatcher(zerolog.DebugLevel).
			Str("filename", lf.Name).
			Interface("titleVariations", titleVariations).
			Msg("Matching local file")
	}
	m.ScanSummaryLogger.LogDebug(lf, util.InlineSpewT(titleVariations))

	// reportNoResult records that no comparison produced a candidate. This is
	// handled as a regular "not matched" outcome instead of letting a nil
	// result crash into the panic handler.
	reportNoResult := func() {
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.WarnLevel).
				Str("filename", lf.Name).
				Msg("No comparison result found")
		}
		m.ScanSummaryLogger.LogFileNotMatched(lf, "No comparison result found")
	}

	//------------------

	// At most one of these is set, depending on the selected algorithm.
	var (
		levMatch     *comparison.LevenshteinResult
		sdMatch      *comparison.SorensenDiceResult
		jaccardMatch *comparison.JaccardResult
	)

	switch m.Algorithm {
	case "jaccard":
		// A higher rating is better.
		keepPrev := func(prev, curr *comparison.JaccardResult) bool {
			return prev.Rating > curr.Rating
		}
		// Best candidate for each title variation (nil when none was found).
		compResults := lop.Map(titleVariations, func(title *string, _ int) *comparison.JaccardResult {
			comps := make([]*comparison.JaccardResult, 0, 3)
			if len(m.MediaContainer.engTitles) > 0 {
				if eng, ok := comparison.FindBestMatchWithJaccard(title, m.MediaContainer.engTitles); ok {
					comps = append(comps, eng)
				}
			}
			if len(m.MediaContainer.romTitles) > 0 {
				if rom, ok := comparison.FindBestMatchWithJaccard(title, m.MediaContainer.romTitles); ok {
					comps = append(comps, rom)
				}
			}
			if len(m.MediaContainer.synonyms) > 0 {
				if syn, ok := comparison.FindBestMatchWithJaccard(title, m.MediaContainer.synonyms); ok {
					comps = append(comps, syn)
				}
			}
			return selectBestResult(comps, keepPrev)
		})
		// Best candidate over all title variations.
		jaccardMatch = selectBestResult(compResults, keepPrev)
		if jaccardMatch == nil {
			reportNoResult()
			return
		}
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.DebugLevel).
				Str("filename", lf.Name).
				Interface("match", jaccardMatch).
				Interface("results", compResults).
				Msg("Jaccard match")
		}
		m.ScanSummaryLogger.LogComparison(lf, "Jaccard", *jaccardMatch.Value, "Rating", util.InlineSpewT(jaccardMatch.Rating))

	case "sorensen-dice":
		// A higher rating is better.
		keepPrev := func(prev, curr *comparison.SorensenDiceResult) bool {
			return prev.Rating > curr.Rating
		}
		// Best candidate for each title variation (nil when none was found).
		compResults := lop.Map(titleVariations, func(title *string, _ int) *comparison.SorensenDiceResult {
			comps := make([]*comparison.SorensenDiceResult, 0, 3)
			if len(m.MediaContainer.engTitles) > 0 {
				if eng, ok := comparison.FindBestMatchWithSorensenDice(title, m.MediaContainer.engTitles); ok {
					comps = append(comps, eng)
				}
			}
			if len(m.MediaContainer.romTitles) > 0 {
				if rom, ok := comparison.FindBestMatchWithSorensenDice(title, m.MediaContainer.romTitles); ok {
					comps = append(comps, rom)
				}
			}
			if len(m.MediaContainer.synonyms) > 0 {
				if syn, ok := comparison.FindBestMatchWithSorensenDice(title, m.MediaContainer.synonyms); ok {
					comps = append(comps, syn)
				}
			}
			return selectBestResult(comps, keepPrev)
		})
		// Best candidate over all title variations.
		sdMatch = selectBestResult(compResults, keepPrev)
		if sdMatch == nil {
			reportNoResult()
			return
		}
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.DebugLevel).
				Str("filename", lf.Name).
				Interface("match", sdMatch).
				Interface("results", compResults).
				Msg("Sorensen-Dice match")
		}
		m.ScanSummaryLogger.LogComparison(lf, "Sorensen-Dice", *sdMatch.Value, "Rating", util.InlineSpewT(sdMatch.Rating))

	default:
		// Levenshtein: a smaller distance is better.
		keepPrev := func(prev, curr *comparison.LevenshteinResult) bool {
			return prev.Distance < curr.Distance
		}
		// Best candidate for each title variation (nil when none was found).
		levCompResults := lop.Map(titleVariations, func(title *string, _ int) *comparison.LevenshteinResult {
			comps := make([]*comparison.LevenshteinResult, 0, 3)
			if len(m.MediaContainer.engTitles) > 0 {
				if eng, ok := comparison.FindBestMatchWithLevenshtein(title, m.MediaContainer.engTitles); ok {
					comps = append(comps, eng)
				}
			}
			if len(m.MediaContainer.romTitles) > 0 {
				if rom, ok := comparison.FindBestMatchWithLevenshtein(title, m.MediaContainer.romTitles); ok {
					comps = append(comps, rom)
				}
			}
			if len(m.MediaContainer.synonyms) > 0 {
				if syn, ok := comparison.FindBestMatchWithLevenshtein(title, m.MediaContainer.synonyms); ok {
					comps = append(comps, syn)
				}
			}
			return selectBestResult(comps, keepPrev)
		})
		// Best candidate over all title variations.
		levMatch = selectBestResult(levCompResults, keepPrev)
		if levMatch == nil {
			reportNoResult()
			return
		}
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.DebugLevel).
				Str("filename", lf.Name).
				Interface("match", levMatch).
				Interface("results", levCompResults).
				Int("distance", levMatch.Distance).
				Msg("Levenshtein match")
		}
		m.ScanSummaryLogger.LogComparison(lf, "Levenshtein", *levMatch.Value, "Distance", util.InlineSpewT(levMatch.Distance))
	}

	//------------------

	// Resolve the winning title to a media entry and compute the rating that is
	// checked against the threshold.
	var (
		mediaMatch  *anime.NormalizedMedia
		found       bool
		finalRating float64
	)
	switch {
	case sdMatch != nil:
		finalRating = sdMatch.Rating
		mediaMatch, found = m.MediaContainer.GetMediaFromTitleOrSynonym(sdMatch.Value)
	case jaccardMatch != nil:
		finalRating = jaccardMatch.Rating
		mediaMatch, found = m.MediaContainer.GetMediaFromTitleOrSynonym(jaccardMatch.Value)
	default:
		// A Levenshtein distance is not comparable to the threshold, so the
		// winning pair is re-rated with Sorensen-Dice.
		// levMatch is non-nil here: the Levenshtein branch returns early otherwise.
		dice := metrics.NewSorensenDice()
		dice.CaseSensitive = false
		dice.NgramSize = 1
		finalRating = dice.Compare(*levMatch.OriginalValue, *levMatch.Value)
		m.ScanSummaryLogger.LogComparison(lf, "Sorensen-Dice", *levMatch.Value, "Final rating", util.InlineSpewT(finalRating))
		mediaMatch, found = m.MediaContainer.GetMediaFromTitleOrSynonym(levMatch.Value)
	}

	// Let hooks inspect or override the result.
	// The trigger's error is deliberately ignored.
	event := &ScanLocalFileMatchedEvent{
		LocalFile: lf,
		Score:     finalRating,
		Match:     mediaMatch,
		Found:     found,
	}
	_ = hook.GlobalHookManager.OnScanLocalFileMatched().Trigger(event)
	lf = event.LocalFile
	mediaMatch = event.Match
	found = event.Found
	finalRating = event.Score

	// When the hook prevented the default behavior, its result is applied
	// as-is and the threshold check is skipped.
	if event.DefaultPrevented {
		if m.ScanLogger != nil {
			if mediaMatch != nil {
				m.ScanLogger.LogMatcher(zerolog.DebugLevel).
					Str("filename", lf.Name).
					Int("id", mediaMatch.ID).
					Msg("Hook overrode match")
			} else {
				m.ScanLogger.LogMatcher(zerolog.DebugLevel).
					Str("filename", lf.Name).
					Msg("Hook overrode match, no match found")
			}
		}
		if mediaMatch != nil {
			lf.MediaId = mediaMatch.ID
		} else {
			lf.MediaId = 0
		}
		return
	}

	// mediaMatch is also checked because a hook may have cleared it while
	// leaving found set.
	if !found || mediaMatch == nil {
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.ErrorLevel).
				Str("filename", lf.Name).
				Msg("No media found from comparison result")
		}
		m.ScanSummaryLogger.LogFileNotMatched(lf, "No media found from comparison result")
		return
	}

	if m.ScanLogger != nil {
		m.ScanLogger.LogMatcher(zerolog.DebugLevel).
			Str("filename", lf.Name).
			Str("title", mediaMatch.GetTitleSafe()).
			Int("id", mediaMatch.ID).
			Msg("Best match found")
	}

	// Reject matches whose rating is below the threshold.
	if finalRating < m.Threshold {
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.DebugLevel).
				Str("filename", lf.Name).
				Float64("rating", finalRating).
				Float64("threshold", m.Threshold).
				Msg("Best match Sorensen-Dice rating too low, un-matching file")
		}
		m.ScanSummaryLogger.LogFailedMatch(lf, fmt.Sprintf("Rating too low, threshold is %f", m.Threshold))
		return
	}

	if m.ScanLogger != nil {
		m.ScanLogger.LogMatcher(zerolog.DebugLevel).
			Str("filename", lf.Name).
			Float64("rating", finalRating).
			Float64("threshold", m.Threshold).
			Msg("Best match rating high enough, matching file")
	}
	m.ScanSummaryLogger.LogSuccessfullyMatched(lf, mediaMatch.ID)
	lf.MediaId = mediaMatch.ID
}

// selectBestResult returns the preferred non-nil element of results, or nil
// when results holds no non-nil element. keepPrev reports whether prev must be
// kept over curr; when it returns false (including on ties) the later element
// wins.
func selectBestResult[T any](results []*T, keepPrev func(prev, curr *T) bool) *T {
	var best *T
	for _, curr := range results {
		if curr == nil {
			continue
		}
		if best == nil || !keepPrev(best, curr) {
			best = curr
		}
	}
	return best
}
//----------------------------------------------------------------------------------------------------------------------
// validateMatches groups the matched local files by media ID and validates
// each group concurrently with validateMatchGroup, which un-matches the files
// rated far below the best-rated file of their group. Unmatched files
// (media ID 0) are ignored. The function returns once every group is done.
//
// NOTE(review): the worker closure captures the loop variables directly, which
// is only safe with per-iteration loop variables (Go 1.22+) - confirm the
// module's Go version.
func (m *Matcher) validateMatches() {
	if logger := m.ScanLogger; logger != nil {
		logger.LogMatcher(zerolog.InfoLevel).Msg("Validating matches")
	}

	// Bucket the files by the media they were matched with.
	byMediaID := lop.GroupBy(m.LocalFiles, func(file *anime.LocalFile) int {
		return file.MediaId
	})
	// Key 0 holds the unmatched files; there is nothing to validate for them.
	delete(byMediaID, 0)

	// Validate each group in its own task.
	workers := pool.New()
	for mediaID, group := range byMediaID {
		workers.Go(func() {
			if len(group) == 0 {
				return
			}
			m.validateMatchGroup(mediaID, group)
		})
	}
	workers.Wait()
}
// validateMatchGroup re-rates the local files matched with the same media
// against that media's titles (Sorensen-Dice on the parsed title) and
// un-matches the files rated more than 0.7 below the group's highest rating.
// The goal is to filter out wrong matches.
//
// Files whose name is flagged by comparison.ValueContainsSpecial or
// comparison.ValueContainsNC contribute a rating of 0 to the group and are
// never un-matched. Nothing happens when mediaId is unknown to the media
// container.
func (m *Matcher) validateMatchGroup(mediaId int, lfs []*anime.LocalFile) {
	media, ok := m.MediaContainer.GetMediaFromId(mediaId)
	if !ok {
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.ErrorLevel).
				Int("mediaId", mediaId).
				Msg("Media not found in media container")
		}
		return
	}

	titles := media.GetAllTitles()

	// isExcluded reports whether a file is exempt from the validation.
	isExcluded := func(file *anime.LocalFile) bool {
		return comparison.ValueContainsSpecial(file.Name) || comparison.ValueContainsNC(file.Name)
	}

	// Rate every file's parsed title against the media titles, concurrently.
	ratingPool := pool.NewWithResults[float64]()
	for _, file := range lfs {
		ratingPool.Go(func() float64 {
			parsed := file.GetParsedTitle()
			if isExcluded(file) {
				return 0
			}
			if res, matched := comparison.FindBestMatchWithSorensenDice(&parsed, titles); matched {
				return res.Rating
			}
			return 0
		})
	}
	ratings := ratingPool.Wait()

	if m.ScanLogger != nil {
		m.ScanLogger.LogMatcher(zerolog.DebugLevel).
			Int("mediaId", mediaId).
			Any("fileRatings", ratings).
			Msg("File ratings")
	}

	// The highest rating is the ceiling the files are measured against.
	ceiling := 0.0
	for _, rating := range ratings {
		if ceiling > rating {
			continue
		}
		ceiling = rating
	}

	lop.ForEach(lfs, func(file *anime.LocalFile, _ int) {
		if isExcluded(file) {
			return
		}
		parsed := file.GetParsedTitle()
		res, matched := comparison.FindBestMatchWithSorensenDice(&parsed, titles)
		if !matched {
			return
		}

		// Deliberately lax: only a gap above 0.7 un-matches a file, since many
		// anime have very long names that can be truncated.
		tooLow := res.Rating < ceiling && math.Abs(res.Rating-ceiling) > 0.7
		if !tooLow {
			if m.ScanLogger != nil {
				m.ScanLogger.LogMatcher(zerolog.DebugLevel).
					Int("mediaId", mediaId).
					Str("filename", file.Name).
					Float64("rating", res.Rating).
					Float64("highestRating", ceiling).
					Msg("Rating matches parameters, keeping file matched")
			}
			m.ScanSummaryLogger.LogMatchValidated(file, mediaId)
			return
		}

		file.MediaId = 0
		if m.ScanLogger != nil {
			m.ScanLogger.LogMatcher(zerolog.WarnLevel).
				Int("mediaId", mediaId).
				Str("filename", file.Name).
				Float64("rating", res.Rating).
				Float64("highestRating", ceiling).
				Msg("Rating does not match parameters, un-matching file")
		}
		m.ScanSummaryLogger.LogUnmatched(file, fmt.Sprintf("Rating does not match parameters. File rating: %f, highest rating: %f", res.Rating, ceiling))
	})
}