553 lines
17 KiB
Go
553 lines
17 KiB
Go
package scanner
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"seanime/internal/api/anilist"
|
|
"seanime/internal/hook"
|
|
"seanime/internal/library/anime"
|
|
"seanime/internal/library/summary"
|
|
"seanime/internal/util"
|
|
"seanime/internal/util/comparison"
|
|
"time"
|
|
|
|
"github.com/adrg/strutil/metrics"
|
|
"github.com/rs/zerolog"
|
|
"github.com/samber/lo"
|
|
lop "github.com/samber/lo/parallel"
|
|
"github.com/sourcegraph/conc/pool"
|
|
)
|
|
|
|
type Matcher struct {
|
|
LocalFiles []*anime.LocalFile
|
|
MediaContainer *MediaContainer
|
|
CompleteAnimeCache *anilist.CompleteAnimeCache
|
|
Logger *zerolog.Logger
|
|
ScanLogger *ScanLogger
|
|
ScanSummaryLogger *summary.ScanSummaryLogger // optional
|
|
Algorithm string
|
|
Threshold float64
|
|
}
|
|
|
|
var (
|
|
ErrNoLocalFiles = errors.New("[matcher] no local files")
|
|
)
|
|
|
|
// MatchLocalFilesWithMedia will match each anime.LocalFile with a specific anilist.BaseAnime and modify the LocalFile's `mediaId`
|
|
func (m *Matcher) MatchLocalFilesWithMedia() error {
|
|
|
|
if m.Threshold == 0 {
|
|
m.Threshold = 0.5
|
|
}
|
|
|
|
start := time.Now()
|
|
|
|
if len(m.LocalFiles) == 0 {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.WarnLevel).Msg("No local files")
|
|
}
|
|
return ErrNoLocalFiles
|
|
}
|
|
if len(m.MediaContainer.allMedia) == 0 {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.WarnLevel).Msg("No media fed into the matcher")
|
|
}
|
|
return errors.New("[matcher] no media fed into the matcher")
|
|
}
|
|
|
|
m.Logger.Debug().Msg("matcher: Starting matching process")
|
|
|
|
// Invoke ScanMatchingStarted hook
|
|
event := &ScanMatchingStartedEvent{
|
|
LocalFiles: m.LocalFiles,
|
|
NormalizedMedia: m.MediaContainer.NormalizedMedia,
|
|
Algorithm: m.Algorithm,
|
|
Threshold: m.Threshold,
|
|
}
|
|
_ = hook.GlobalHookManager.OnScanMatchingStarted().Trigger(event)
|
|
m.LocalFiles = event.LocalFiles
|
|
m.MediaContainer.NormalizedMedia = event.NormalizedMedia
|
|
m.Algorithm = event.Algorithm
|
|
m.Threshold = event.Threshold
|
|
|
|
if event.DefaultPrevented {
|
|
m.Logger.Debug().Msg("matcher: Match stopped by hook")
|
|
return nil
|
|
}
|
|
|
|
// Parallelize the matching process
|
|
lop.ForEach(m.LocalFiles, func(localFile *anime.LocalFile, _ int) {
|
|
m.matchLocalFileWithMedia(localFile)
|
|
})
|
|
|
|
// m.validateMatches()
|
|
|
|
// Invoke ScanMatchingCompleted hook
|
|
completedEvent := &ScanMatchingCompletedEvent{
|
|
LocalFiles: m.LocalFiles,
|
|
}
|
|
_ = hook.GlobalHookManager.OnScanMatchingCompleted().Trigger(completedEvent)
|
|
m.LocalFiles = completedEvent.LocalFiles
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.InfoLevel).
|
|
Int64("ms", time.Since(start).Milliseconds()).
|
|
Int("files", len(m.LocalFiles)).
|
|
Int("unmatched", lo.CountBy(m.LocalFiles, func(localFile *anime.LocalFile) bool {
|
|
return localFile.MediaId == 0
|
|
})).
|
|
Msg("Finished matching process")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// matchLocalFileWithMedia finds the best match for the local file
|
|
// If the best match is above a certain threshold, set the local file's mediaId to the best match's id
|
|
// If the best match is below a certain threshold, leave the local file's mediaId to 0
|
|
func (m *Matcher) matchLocalFileWithMedia(lf *anime.LocalFile) {
|
|
defer util.HandlePanicInModuleThenS("scanner/matcher/matchLocalFileWithMedia", func(stackTrace string) {
|
|
lf.MediaId = 0
|
|
/*Log*/
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.ErrorLevel).
|
|
Str("filename", lf.Name).
|
|
Msg("Panic occurred, file un-matched")
|
|
}
|
|
m.ScanSummaryLogger.LogPanic(lf, stackTrace)
|
|
})
|
|
|
|
// Check if the local file has already been matched
|
|
if lf.MediaId != 0 {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Msg("File already matched")
|
|
}
|
|
m.ScanSummaryLogger.LogFileNotMatched(lf, "Already matched")
|
|
return
|
|
}
|
|
// Check if the local file has a title
|
|
if lf.GetParsedTitle() == "" {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.WarnLevel).
|
|
Str("filename", lf.Name).
|
|
Msg("File has no parsed title")
|
|
}
|
|
m.ScanSummaryLogger.LogFileNotMatched(lf, "No parsed title found")
|
|
return
|
|
}
|
|
|
|
// Create title variations
|
|
// Check cache for title variation
|
|
|
|
titleVariations := lf.GetTitleVariations()
|
|
|
|
if len(titleVariations) == 0 {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.WarnLevel).
|
|
Str("filename", lf.Name).
|
|
Msg("No titles found")
|
|
}
|
|
m.ScanSummaryLogger.LogFileNotMatched(lf, "No title variations found")
|
|
return
|
|
}
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Interface("titleVariations", titleVariations).
|
|
Msg("Matching local file")
|
|
}
|
|
m.ScanSummaryLogger.LogDebug(lf, util.InlineSpewT(titleVariations))
|
|
|
|
//------------------
|
|
|
|
var levMatch *comparison.LevenshteinResult
|
|
var sdMatch *comparison.SorensenDiceResult
|
|
var jaccardMatch *comparison.JaccardResult
|
|
|
|
if m.Algorithm == "jaccard" {
|
|
// Using Jaccard
|
|
// Get the matchs for each title variation
|
|
compResults := lop.Map(titleVariations, func(title *string, _ int) *comparison.JaccardResult {
|
|
comps := make([]*comparison.JaccardResult, 0)
|
|
if len(m.MediaContainer.engTitles) > 0 {
|
|
if eng, found := comparison.FindBestMatchWithJaccard(title, m.MediaContainer.engTitles); found {
|
|
comps = append(comps, eng)
|
|
}
|
|
}
|
|
if len(m.MediaContainer.romTitles) > 0 {
|
|
if rom, found := comparison.FindBestMatchWithJaccard(title, m.MediaContainer.romTitles); found {
|
|
comps = append(comps, rom)
|
|
}
|
|
}
|
|
if len(m.MediaContainer.synonyms) > 0 {
|
|
if syn, found := comparison.FindBestMatchWithJaccard(title, m.MediaContainer.synonyms); found {
|
|
comps = append(comps, syn)
|
|
}
|
|
}
|
|
var res *comparison.JaccardResult
|
|
if len(comps) > 1 {
|
|
res = lo.Reduce(comps, func(prev *comparison.JaccardResult, curr *comparison.JaccardResult, _ int) *comparison.JaccardResult {
|
|
if prev.Rating > curr.Rating {
|
|
return prev
|
|
} else {
|
|
return curr
|
|
}
|
|
}, comps[0])
|
|
} else if len(comps) == 1 {
|
|
return comps[0]
|
|
}
|
|
return res
|
|
})
|
|
|
|
// Retrieve the match from all the title variations results
|
|
jaccardMatch = lo.Reduce(compResults, func(prev *comparison.JaccardResult, curr *comparison.JaccardResult, _ int) *comparison.JaccardResult {
|
|
if prev.Rating > curr.Rating {
|
|
return prev
|
|
} else {
|
|
return curr
|
|
}
|
|
}, compResults[0])
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Interface("match", jaccardMatch).
|
|
Interface("results", compResults).
|
|
Msg("Jaccard match")
|
|
}
|
|
m.ScanSummaryLogger.LogComparison(lf, "Jaccard", *jaccardMatch.Value, "Rating", util.InlineSpewT(jaccardMatch.Rating))
|
|
|
|
} else if m.Algorithm == "sorensen-dice" {
|
|
// Using Sorensen-Dice
|
|
// Get the matchs for each title variation
|
|
compResults := lop.Map(titleVariations, func(title *string, _ int) *comparison.SorensenDiceResult {
|
|
comps := make([]*comparison.SorensenDiceResult, 0)
|
|
if len(m.MediaContainer.engTitles) > 0 {
|
|
if eng, found := comparison.FindBestMatchWithSorensenDice(title, m.MediaContainer.engTitles); found {
|
|
comps = append(comps, eng)
|
|
}
|
|
}
|
|
if len(m.MediaContainer.romTitles) > 0 {
|
|
if rom, found := comparison.FindBestMatchWithSorensenDice(title, m.MediaContainer.romTitles); found {
|
|
comps = append(comps, rom)
|
|
}
|
|
}
|
|
if len(m.MediaContainer.synonyms) > 0 {
|
|
if syn, found := comparison.FindBestMatchWithSorensenDice(title, m.MediaContainer.synonyms); found {
|
|
comps = append(comps, syn)
|
|
}
|
|
}
|
|
var res *comparison.SorensenDiceResult
|
|
if len(comps) > 1 {
|
|
res = lo.Reduce(comps, func(prev *comparison.SorensenDiceResult, curr *comparison.SorensenDiceResult, _ int) *comparison.SorensenDiceResult {
|
|
if prev.Rating > curr.Rating {
|
|
return prev
|
|
} else {
|
|
return curr
|
|
}
|
|
}, comps[0])
|
|
} else if len(comps) == 1 {
|
|
return comps[0]
|
|
}
|
|
return res
|
|
})
|
|
|
|
// Retrieve the match from all the title variations results
|
|
sdMatch = lo.Reduce(compResults, func(prev *comparison.SorensenDiceResult, curr *comparison.SorensenDiceResult, _ int) *comparison.SorensenDiceResult {
|
|
if prev.Rating > curr.Rating {
|
|
return prev
|
|
} else {
|
|
return curr
|
|
}
|
|
}, compResults[0])
|
|
|
|
//util.Spew(compResults)
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Interface("match", sdMatch).
|
|
Interface("results", compResults).
|
|
Msg("Sorensen-Dice match")
|
|
}
|
|
m.ScanSummaryLogger.LogComparison(lf, "Sorensen-Dice", *sdMatch.Value, "Rating", util.InlineSpewT(sdMatch.Rating))
|
|
|
|
} else {
|
|
// Using Levenshtein
|
|
// Get the matches for each title variation
|
|
levCompResults := lop.Map(titleVariations, func(title *string, _ int) *comparison.LevenshteinResult {
|
|
comps := make([]*comparison.LevenshteinResult, 0)
|
|
if len(m.MediaContainer.engTitles) > 0 {
|
|
if eng, found := comparison.FindBestMatchWithLevenshtein(title, m.MediaContainer.engTitles); found {
|
|
comps = append(comps, eng)
|
|
}
|
|
}
|
|
if len(m.MediaContainer.romTitles) > 0 {
|
|
if rom, found := comparison.FindBestMatchWithLevenshtein(title, m.MediaContainer.romTitles); found {
|
|
comps = append(comps, rom)
|
|
}
|
|
}
|
|
if len(m.MediaContainer.synonyms) > 0 {
|
|
if syn, found := comparison.FindBestMatchWithLevenshtein(title, m.MediaContainer.synonyms); found {
|
|
comps = append(comps, syn)
|
|
}
|
|
}
|
|
var res *comparison.LevenshteinResult
|
|
if len(comps) > 1 {
|
|
res = lo.Reduce(comps, func(prev *comparison.LevenshteinResult, curr *comparison.LevenshteinResult, _ int) *comparison.LevenshteinResult {
|
|
if prev.Distance < curr.Distance {
|
|
return prev
|
|
} else {
|
|
return curr
|
|
}
|
|
}, comps[0])
|
|
} else if len(comps) == 1 {
|
|
return comps[0]
|
|
}
|
|
return res
|
|
})
|
|
|
|
levMatch = lo.Reduce(levCompResults, func(prev *comparison.LevenshteinResult, curr *comparison.LevenshteinResult, _ int) *comparison.LevenshteinResult {
|
|
if prev.Distance < curr.Distance {
|
|
return prev
|
|
} else {
|
|
return curr
|
|
}
|
|
}, levCompResults[0])
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Interface("match", levMatch).
|
|
Interface("results", levCompResults).
|
|
Int("distance", levMatch.Distance).
|
|
Msg("Levenshtein match")
|
|
}
|
|
m.ScanSummaryLogger.LogComparison(lf, "Levenshtein", *levMatch.Value, "Distance", util.InlineSpewT(levMatch.Distance))
|
|
}
|
|
|
|
//------------------
|
|
|
|
var mediaMatch *anime.NormalizedMedia
|
|
var found bool
|
|
finalRating := 0.0
|
|
|
|
if sdMatch != nil {
|
|
finalRating = sdMatch.Rating
|
|
mediaMatch, found = m.MediaContainer.GetMediaFromTitleOrSynonym(sdMatch.Value)
|
|
|
|
} else if jaccardMatch != nil {
|
|
finalRating = jaccardMatch.Rating
|
|
mediaMatch, found = m.MediaContainer.GetMediaFromTitleOrSynonym(jaccardMatch.Value)
|
|
|
|
} else {
|
|
dice := metrics.NewSorensenDice()
|
|
dice.CaseSensitive = false
|
|
dice.NgramSize = 1
|
|
finalRating = dice.Compare(*levMatch.OriginalValue, *levMatch.Value)
|
|
m.ScanSummaryLogger.LogComparison(lf, "Sorensen-Dice", *levMatch.Value, "Final rating", util.InlineSpewT(finalRating))
|
|
mediaMatch, found = m.MediaContainer.GetMediaFromTitleOrSynonym(levMatch.Value)
|
|
}
|
|
|
|
// After setting the mediaId, add the hook invocation
|
|
// Invoke ScanLocalFileMatched hook
|
|
event := &ScanLocalFileMatchedEvent{
|
|
LocalFile: lf,
|
|
Score: finalRating,
|
|
Match: mediaMatch,
|
|
Found: found,
|
|
}
|
|
hook.GlobalHookManager.OnScanLocalFileMatched().Trigger(event)
|
|
lf = event.LocalFile
|
|
mediaMatch = event.Match
|
|
found = event.Found
|
|
finalRating = event.Score
|
|
|
|
// Check if the hook overrode the match
|
|
if event.DefaultPrevented {
|
|
if m.ScanLogger != nil {
|
|
if mediaMatch != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Int("id", mediaMatch.ID).
|
|
Msg("Hook overrode match")
|
|
} else {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Msg("Hook overrode match, no match found")
|
|
}
|
|
}
|
|
if mediaMatch != nil {
|
|
lf.MediaId = mediaMatch.ID
|
|
} else {
|
|
lf.MediaId = 0
|
|
}
|
|
return
|
|
}
|
|
|
|
if !found {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.ErrorLevel).
|
|
Str("filename", lf.Name).
|
|
Msg("No media found from comparison result")
|
|
}
|
|
m.ScanSummaryLogger.LogFileNotMatched(lf, "No media found from comparison result")
|
|
return
|
|
}
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Str("title", mediaMatch.GetTitleSafe()).
|
|
Int("id", mediaMatch.ID).
|
|
Msg("Best match found")
|
|
}
|
|
|
|
if finalRating < m.Threshold {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Float64("rating", finalRating).
|
|
Float64("threshold", m.Threshold).
|
|
Msg("Best match Sorensen-Dice rating too low, un-matching file")
|
|
}
|
|
m.ScanSummaryLogger.LogFailedMatch(lf, "Rating too low, threshold is "+fmt.Sprintf("%f", m.Threshold))
|
|
return
|
|
}
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Str("filename", lf.Name).
|
|
Float64("rating", finalRating).
|
|
Float64("threshold", m.Threshold).
|
|
Msg("Best match rating high enough, matching file")
|
|
}
|
|
m.ScanSummaryLogger.LogSuccessfullyMatched(lf, mediaMatch.ID)
|
|
|
|
lf.MediaId = mediaMatch.ID
|
|
}
|
|
|
|
//----------------------------------------------------------------------------------------------------------------------
|
|
|
|
// validateMatches compares groups of local files' titles with the media titles and un-matches the local files that have a lower rating than the highest rating.
|
|
func (m *Matcher) validateMatches() {
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.InfoLevel).Msg("Validating matches")
|
|
}
|
|
|
|
// Group local files by media ID
|
|
groups := lop.GroupBy(m.LocalFiles, func(localFile *anime.LocalFile) int {
|
|
return localFile.MediaId
|
|
})
|
|
|
|
// Remove the group with unmatched media
|
|
delete(groups, 0)
|
|
|
|
// Un-match files with lower ratings
|
|
p := pool.New()
|
|
for mId, files := range groups {
|
|
p.Go(func() {
|
|
if len(files) > 0 {
|
|
m.validateMatchGroup(mId, files)
|
|
}
|
|
})
|
|
}
|
|
p.Wait()
|
|
|
|
}
|
|
|
|
// validateMatchGroup compares the local files' titles under the same media
|
|
// with the media titles and un-matches the local files that have a lower rating.
|
|
// This is done to try and filter out wrong matches.
|
|
func (m *Matcher) validateMatchGroup(mediaId int, lfs []*anime.LocalFile) {
|
|
|
|
media, found := m.MediaContainer.GetMediaFromId(mediaId)
|
|
if !found {
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.ErrorLevel).
|
|
Int("mediaId", mediaId).
|
|
Msg("Media not found in media container")
|
|
}
|
|
return
|
|
}
|
|
|
|
titles := media.GetAllTitles()
|
|
|
|
// Compare all files' parsed title with the media title
|
|
// Get the highest rating that will be used to un-match lower rated files
|
|
p := pool.NewWithResults[float64]()
|
|
for _, lf := range lfs {
|
|
p.Go(func() float64 {
|
|
t := lf.GetParsedTitle()
|
|
if comparison.ValueContainsSpecial(lf.Name) || comparison.ValueContainsNC(lf.Name) {
|
|
return 0
|
|
}
|
|
compRes, ok := comparison.FindBestMatchWithSorensenDice(&t, titles)
|
|
if ok {
|
|
return compRes.Rating
|
|
}
|
|
return 0
|
|
})
|
|
}
|
|
fileRatings := p.Wait()
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Int("mediaId", mediaId).
|
|
Any("fileRatings", fileRatings).
|
|
Msg("File ratings")
|
|
}
|
|
|
|
highestRating := lo.Reduce(fileRatings, func(prev float64, curr float64, _ int) float64 {
|
|
if prev > curr {
|
|
return prev
|
|
} else {
|
|
return curr
|
|
}
|
|
}, 0.0)
|
|
|
|
// Un-match files that have a lower rating than the ceiling
|
|
// UNLESS they are Special or NC
|
|
lop.ForEach(lfs, func(lf *anime.LocalFile, _ int) {
|
|
if !comparison.ValueContainsSpecial(lf.Name) && !comparison.ValueContainsNC(lf.Name) {
|
|
t := lf.GetParsedTitle()
|
|
if compRes, ok := comparison.FindBestMatchWithSorensenDice(&t, titles); ok {
|
|
// If the local file's rating is lower, un-match it
|
|
// Unless the difference is less than 0.7 (very lax since a lot of anime have very long names that can be truncated)
|
|
if compRes.Rating < highestRating && math.Abs(compRes.Rating-highestRating) > 0.7 {
|
|
lf.MediaId = 0
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.WarnLevel).
|
|
Int("mediaId", mediaId).
|
|
Str("filename", lf.Name).
|
|
Float64("rating", compRes.Rating).
|
|
Float64("highestRating", highestRating).
|
|
Msg("Rating does not match parameters, un-matching file")
|
|
}
|
|
m.ScanSummaryLogger.LogUnmatched(lf, fmt.Sprintf("Rating does not match parameters. File rating: %f, highest rating: %f", compRes.Rating, highestRating))
|
|
|
|
} else {
|
|
|
|
if m.ScanLogger != nil {
|
|
m.ScanLogger.LogMatcher(zerolog.DebugLevel).
|
|
Int("mediaId", mediaId).
|
|
Str("filename", lf.Name).
|
|
Float64("rating", compRes.Rating).
|
|
Float64("highestRating", highestRating).
|
|
Msg("Rating matches parameters, keeping file matched")
|
|
}
|
|
m.ScanSummaryLogger.LogMatchValidated(lf, mediaId)
|
|
|
|
}
|
|
}
|
|
}
|
|
})
|
|
|
|
}
|