// File: seanime-docker/seanime-2.9.10/internal/manga/providers/local_parser.go

package manga_providers

import (
    "path/filepath"
    "slices"
    "strconv"
    "strings"
    "unicode"
)

// ScannedChapterFile holds the result of parsing a chapter filename.
type ScannedChapterFile struct {
    Chapter      []string // can be a single chapter or a range of chapters
    MangaTitle   string   // typically comes before the chapter number
    ChapterTitle string   // typically comes after the chapter number
    Volume       []string // typically comes after the chapter number
    IsPDF        bool
}

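// Illustrative example (hypothetical filename): "One Piece - Chapter 1045 - Title.cbz"
// would be expected to parse to MangaTitle "One Piece", Chapter ["1045"],
// ChapterTitle "Title", IsPDF false.
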
// TokenType classifies the tokens produced by the lexer.
type TokenType int

const (
    TokenUnknown TokenType = iota
    TokenText
    TokenNumber
    TokenKeyword
    TokenSeparator
    TokenEnclosed
    TokenFileExtension
)

// Token represents a parsed token from the filename
type Token struct {
    Type      TokenType
    Value     string
    Position  int
    IsChapter bool
    IsVolume  bool
}

// Lexer handles the tokenization of the filename
type Lexer struct {
    input        string
    position     int
    tokens       []Token
    currentToken int
}

// ChapterKeywords are prefixes that mark a chapter number (matched case-insensitively).
var ChapterKeywords = []string{
    "ch", "chp", "chapter", "chap", "c",
}

// VolumeKeywords are prefixes that mark a volume number (matched case-insensitively).
var VolumeKeywords = []string{
    "v", "vol", "volume",
}

// SeparatorChars are characters treated as token separators.
var SeparatorChars = []rune{
    ' ', '-', '_', '.', '[', ']', '(', ')', '{', '}', '~',
}

// ImageExtensions is the set of file extensions recognized as page images.
var ImageExtensions = map[string]struct{}{
    ".png":  {},
    ".jpg":  {},
    ".jpeg": {},
    ".gif":  {},
    ".webp": {},
    ".bmp":  {},
    ".tiff": {},
    ".tif":  {},
}

// NewLexer creates a new lexer instance
func NewLexer(input string) *Lexer {
    return &Lexer{
        input:        strings.TrimSpace(input),
        tokens:       make([]Token, 0),
        currentToken: 0,
    }
}

// Tokenize breaks down the input into tokens
func (l *Lexer) Tokenize() []Token {
    l.position = 0
    l.tokens = make([]Token, 0)
    for l.position < len(l.input) {
        if l.isWhitespace(l.current()) {
            l.skipWhitespace()
            continue
        }
        if l.isEnclosedStart(l.current()) {
            l.readEnclosed()
            continue
        }
        if l.isSeparator(l.current()) {
            l.readSeparator()
            continue
        }
        if l.isDigit(l.current()) {
            l.readNumber()
            continue
        }
        if l.isLetter(l.current()) {
            l.readText()
            continue
        }
        // Skip unknown characters
        l.position++
    }
    l.classifyTokens()
    return l.tokens
}

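// For instance (hypothetical input), "One Piece ch012.cbz" would be expected
// to tokenize into roughly: Text("One"), Text("Piece"), Keyword("ch"),
// Number("012"), Separator("."), FileExtension("cbz").
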
// current returns the current character.
// Note: indexing is byte-based, so multi-byte UTF-8 runes are seen one byte
// at a time; such bytes match none of the ASCII predicates below and are
// skipped by Tokenize as unknown characters.
func (l *Lexer) current() rune {
    if l.position >= len(l.input) {
        return 0
    }
    return rune(l.input[l.position])
}

// peek returns the next character without advancing
func (l *Lexer) peek() rune {
    if l.position+1 >= len(l.input) {
        return 0
    }
    return rune(l.input[l.position+1])
}

// advance moves to the next character
func (l *Lexer) advance() {
    l.position++
}

// isWhitespace checks if character is whitespace
func (l *Lexer) isWhitespace(r rune) bool {
    return r == ' ' || r == '\t' || r == '\n' || r == '\r'
}

// isSeparator checks if character is a separator
func (l *Lexer) isSeparator(r rune) bool {
    for _, sep := range SeparatorChars {
        if r == sep {
            return true
        }
    }
    return false
}

// isEnclosedStart checks if character starts an enclosed section
func (l *Lexer) isEnclosedStart(r rune) bool {
    return r == '[' || r == '(' || r == '{'
}

// isDigit checks if character is an ASCII digit
func (l *Lexer) isDigit(r rune) bool {
    return r >= '0' && r <= '9'
}

// isLetter checks if character is an ASCII letter
func (l *Lexer) isLetter(r rune) bool {
    return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}

// skipWhitespace skips all whitespace characters
func (l *Lexer) skipWhitespace() {
    for l.position < len(l.input) && l.isWhitespace(l.current()) {
        l.advance()
    }
}

// readEnclosed reads content within brackets/parentheses.
// Note: if the bracket is never closed, its content is silently dropped.
func (l *Lexer) readEnclosed() {
    start := l.position
    openChar := l.current()
    var closeChar rune
    switch openChar {
    case '[':
        closeChar = ']'
    case '(':
        closeChar = ')'
    case '{':
        closeChar = '}'
    default:
        l.advance()
        return
    }
    l.advance() // Skip opening character
    startContent := l.position
    for l.position < len(l.input) && l.current() != closeChar {
        l.advance()
    }
    if l.position < len(l.input) {
        content := l.input[startContent:l.position]
        l.advance() // Skip closing character
        // Only add if content is meaningful
        if len(strings.TrimSpace(content)) > 0 {
            l.addToken(TokenEnclosed, content, start)
        }
    }
}

// readSeparator reads separator characters
func (l *Lexer) readSeparator() {
    start := l.position
    value := string(l.current())
    l.advance()
    l.addToken(TokenSeparator, value, start)
}

// readNumber reads numeric values (including decimals)
func (l *Lexer) readNumber() {
    start := l.position
    for l.position < len(l.input) && (l.isDigit(l.current()) || l.current() == '.') {
        // Stop if we hit a file extension
        if l.current() == '.' && l.position+1 < len(l.input) {
            // Check if the dot is followed by a known archive/document extension
            remaining := l.input[l.position+1:]
            if strings.HasPrefix(remaining, "cbz") || strings.HasPrefix(remaining, "cbr") ||
                strings.HasPrefix(remaining, "pdf") || strings.HasPrefix(remaining, "epub") {
                break
            }
        }
        l.advance()
    }
    value := l.input[start:l.position]
    l.addToken(TokenNumber, value, start)
}

// readText reads a run of alphanumeric text that begins with a letter
func (l *Lexer) readText() {
    start := l.position
    for l.position < len(l.input) && (l.isLetter(l.current()) || l.isDigit(l.current())) {
        l.advance()
    }
    value := l.input[start:l.position]
    lowerValue := strings.ToLower(value) // Use lowercase for keyword checking
    // Check if this might be a concatenated keyword that continues with a decimal
    if l.startsWithKeyword(lowerValue) && l.position < len(l.input) && l.current() == '.' {
        // Look ahead to see if there are more digits after the decimal point
        tempPos := l.position + 1
        if tempPos < len(l.input) && l.isDigit(rune(l.input[tempPos])) {
            // Read the decimal part
            l.advance() // consume the '.'
            for l.position < len(l.input) && l.isDigit(l.current()) {
                l.advance()
            }
            // Update value to include the decimal part
            value = l.input[start:l.position]
            lowerValue = strings.ToLower(value)
        }
    }
    // Check for concatenated keywords like "ch001", "c001", "chp001", "c12.5"
    if l.containsKeywordPrefix(lowerValue) {
        l.splitKeywordAndNumber(lowerValue, value, start) // Pass both versions
    } else {
        l.addToken(TokenText, value, start) // Use original case
    }
}

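// For example, a token like "Ch12.5" would be expected to be split into the
// keyword "Ch" (chapter) and the number "12.5", preserving the original case.
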
// startsWithKeyword checks if text starts with any known keyword
func (l *Lexer) startsWithKeyword(text string) bool {
    for _, keyword := range ChapterKeywords {
        if strings.HasPrefix(text, keyword) {
            return true
        }
    }
    for _, keyword := range VolumeKeywords {
        if strings.HasPrefix(text, keyword) {
            return true
        }
    }
    return false
}

// containsKeywordPrefix checks if text starts with a known keyword followed by a number
func (l *Lexer) containsKeywordPrefix(text string) bool {
    // Sort copies by length descending so longer keywords ("chapter") are
    // tried before their own prefixes ("chap", "ch", "c"); cloning avoids
    // mutating the shared package-level slices.
    chKeywords := slices.Clone(ChapterKeywords)
    slices.SortFunc(chKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range chKeywords {
        if strings.HasPrefix(text, keyword) && len(text) > len(keyword) {
            // Check if the remaining part is numeric (including decimals)
            return l.isValidNumberPart(text[len(keyword):])
        }
    }
    volKeywords := slices.Clone(VolumeKeywords)
    slices.SortFunc(volKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range volKeywords {
        if strings.HasPrefix(text, keyword) && len(text) > len(keyword) {
            return l.isValidNumberPart(text[len(keyword):])
        }
    }
    return false
}

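// E.g. "chapter12" matches ("chapter" + "12"), whereas a shortest-first match
// on "ch" would leave the non-numeric remainder "apter12" and fail.
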
// isValidNumberPart checks if string is a valid number (including decimals)
func (l *Lexer) isValidNumberPart(s string) bool {
    if len(s) == 0 {
        return false
    }
    // Don't allow starting with a decimal point
    if s[0] == '.' {
        return false
    }
    hasDecimal := false
    for _, r := range s {
        if r == '.' {
            if hasDecimal {
                return false // Multiple decimals not allowed
            }
            hasDecimal = true
        } else if !l.isDigit(r) {
            return false
        }
    }
    return true
}

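// E.g. "12" and "12.5" are valid number parts; ".5", "1.2.3", and "12a" are not.
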
// splitKeywordAndNumber splits concatenated keyword and number tokens
func (l *Lexer) splitKeywordAndNumber(lowerText, originalText string, position int) {
    // Match longest keywords first (see containsKeywordPrefix) so that e.g.
    // "chapter12" is split as "chapter"+"12" rather than "ch"+"apter12".
    chKeywords := slices.Clone(ChapterKeywords)
    slices.SortFunc(chKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range chKeywords {
        if strings.HasPrefix(lowerText, keyword) && len(lowerText) > len(keyword) {
            // Use original case for the keyword part
            originalKeyword := originalText[:len(keyword)]
            l.addKeywordToken(originalKeyword, position, true, false)
            // Extract the number part (keeping original formatting)
            numberPart := originalText[len(keyword):]
            l.addToken(TokenNumber, numberPart, position+len(keyword))
            return
        }
    }
    volKeywords := slices.Clone(VolumeKeywords)
    slices.SortFunc(volKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range volKeywords {
        if strings.HasPrefix(lowerText, keyword) && len(lowerText) > len(keyword) {
            originalKeyword := originalText[:len(keyword)]
            l.addKeywordToken(originalKeyword, position, false, true)
            numberPart := originalText[len(keyword):]
            l.addToken(TokenNumber, numberPart, position+len(keyword))
            return
        }
    }
}

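// For example, "V02" would be expected to yield the keyword token "V"
// (volume) followed by the number token "02".
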
// addKeywordToken adds a keyword token with chapter/volume flags
func (l *Lexer) addKeywordToken(value string, position int, isChapter, isVolume bool) {
    l.tokens = append(l.tokens, Token{
        Type:      TokenKeyword,
        Value:     value,
        Position:  position,
        IsChapter: isChapter,
        IsVolume:  isVolume,
    })
}

// addToken adds a token to the list
func (l *Lexer) addToken(tokenType TokenType, value string, position int) {
    l.tokens = append(l.tokens, Token{
        Type:     tokenType,
        Value:    value,
        Position: position,
    })
}

// classifyTokens identifies chapter/volume keywords and file extensions
func (l *Lexer) classifyTokens() {
    for i := range l.tokens {
        token := &l.tokens[i]
        // Check for chapter keywords (case-insensitive)
        lowerValue := strings.ToLower(token.Value)
        for _, keyword := range ChapterKeywords {
            if lowerValue == keyword {
                token.Type = TokenKeyword
                token.IsChapter = true
                break
            }
        }
        // Check for volume keywords (case-insensitive)
        for _, keyword := range VolumeKeywords {
            if lowerValue == keyword {
                token.Type = TokenKeyword
                token.IsVolume = true
                break
            }
        }
        // Check for file extensions (substring match, so "cbz" inside a
        // larger token also counts)
        if strings.Contains(lowerValue, "pdf") || strings.Contains(lowerValue, "cbz") ||
            strings.Contains(lowerValue, "cbr") || strings.Contains(lowerValue, "epub") {
            token.Type = TokenFileExtension
        }
    }
}

// Parser handles the semantic analysis of tokens
type Parser struct {
    tokens []Token
    result *ScannedChapterFile
}

// NewParser creates a new parser instance
func NewParser(tokens []Token) *Parser {
    return &Parser{
        tokens: tokens,
        result: &ScannedChapterFile{
            Chapter: make([]string, 0),
            Volume:  make([]string, 0),
        },
    }
}

// Parse performs semantic analysis on the tokens
func (p *Parser) Parse() *ScannedChapterFile {
    p.extractChapters()
    p.extractVolumes()
    p.extractTitles()
    p.checkPDF()
    return p.result
}

// extractChapters finds and extracts chapter numbers
func (p *Parser) extractChapters() {
    for i, token := range p.tokens {
        if token.IsChapter {
            // Look for a number within the next two tokens after the keyword
            for j := i + 1; j < len(p.tokens) && j < i+3; j++ {
                nextToken := p.tokens[j]
                if nextToken.Type == TokenNumber {
                    p.addChapterNumber(nextToken.Value)
                    break
                } else if nextToken.Type == TokenSeparator {
                    continue
                } else {
                    break
                }
            }
        } else if token.Type == TokenNumber && !token.IsVolume {
            // A standalone number might be a chapter
            if p.isLikelyChapterNumber(token, i) {
                p.addChapterNumber(token.Value)
            }
        }
    }
    // Handle ranges by looking for dash-separated numbers
    p.handleChapterRanges()
}

// handleChapterRanges processes chapter ranges like "1-2" or "001-002"
func (p *Parser) handleChapterRanges() {
    for i := 0; i < len(p.tokens)-2; i++ {
        if p.tokens[i].Type == TokenNumber &&
            p.tokens[i+1].Type == TokenSeparator && p.tokens[i+1].Value == "-" &&
            p.tokens[i+2].Type == TokenNumber {
            // Check if the first number was already recorded as a chapter
            firstIsChapter := false
            for _, ch := range p.result.Chapter {
                if ch == p.tokens[i].Value {
                    firstIsChapter = true
                    break
                }
            }
            if firstIsChapter {
                // Add the second number of the range as a chapter too
                p.result.Chapter = append(p.result.Chapter, p.tokens[i+2].Value)
            }
        }
    }
}

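// E.g. for "ch 001-002", once "001" is recorded as a chapter the trailing
// "002" is appended as well, giving Chapter ["001", "002"].
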
// extractVolumes finds and extracts volume numbers
func (p *Parser) extractVolumes() {
    for i, token := range p.tokens {
        if token.IsVolume {
            // Look for a number within the next two tokens after the keyword
            for j := i + 1; j < len(p.tokens) && j < i+3; j++ {
                nextToken := p.tokens[j]
                if nextToken.Type == TokenNumber {
                    p.result.Volume = append(p.result.Volume, nextToken.Value)
                    break
                } else if nextToken.Type == TokenSeparator {
                    continue
                } else {
                    break
                }
            }
        }
    }
}

// extractTitles finds the manga title and chapter title
func (p *Parser) extractTitles() {
    // Find the first chapter keyword or chapter-like number position
    chapterPos := -1
    for i, token := range p.tokens {
        if token.IsChapter || (token.Type == TokenNumber && p.isLikelyChapterNumber(token, i)) {
            chapterPos = i
            break
        }
    }
    if chapterPos > 0 {
        // Everything before the chapter marker is likely the manga title
        titleParts := make([]string, 0)
        for i := 0; i < chapterPos; i++ {
            token := p.tokens[i]
            if token.Type == TokenText && !token.IsVolume && !p.isIgnoredToken(token) {
                titleParts = append(titleParts, token.Value)
            } else if token.Type == TokenNumber && p.isNumberInTitle(token, i, chapterPos) {
                // Include numbers that are part of the title (but not volume indicators)
                titleParts = append(titleParts, token.Value)
            }
        }
        if len(titleParts) > 0 {
            p.result.MangaTitle = strings.Join(titleParts, " ")
        }
        // Look for a chapter title after the chapter number
        p.extractChapterTitle(chapterPos)
    } else {
        // No clear chapter indicator; check if this is a "number - title" pattern
        if len(p.result.Chapter) > 0 && p.hasChapterTitlePattern() {
            p.extractChapterTitleFromPattern()
        } else {
            // Treat most text as the manga title
            p.extractFallbackTitle()
        }
    }
}

// hasChapterTitlePattern checks for a "number - title" pattern
func (p *Parser) hasChapterTitlePattern() bool {
    for i := 0; i < len(p.tokens)-2; i++ {
        if p.tokens[i].Type == TokenNumber &&
            p.tokens[i+1].Type == TokenSeparator && p.tokens[i+1].Value == "-" &&
            p.tokens[i+2].Type == TokenText {
            return true
        }
    }
    return false
}

// extractChapterTitleFromPattern extracts the title from a "number - title" pattern
func (p *Parser) extractChapterTitleFromPattern() {
    for i := 0; i < len(p.tokens)-2; i++ {
        if p.tokens[i].Type == TokenNumber &&
            p.tokens[i+1].Type == TokenSeparator && p.tokens[i+1].Value == "-" {
            // Collect text after the dash
            titleParts := make([]string, 0)
            for j := i + 2; j < len(p.tokens); j++ {
                token := p.tokens[j]
                if token.Type == TokenText && !p.isIgnoredToken(token) {
                    titleParts = append(titleParts, token.Value)
                } else if token.Type == TokenFileExtension {
                    break
                }
            }
            if len(titleParts) > 0 {
                p.result.ChapterTitle = strings.Join(titleParts, " ")
            }
            break
        }
    }
}

// extractFallbackTitle extracts the title when there are no clear chapter indicators
func (p *Parser) extractFallbackTitle() {
    titleParts := make([]string, 0)
    for _, token := range p.tokens {
        if token.Type == TokenText && !p.isIgnoredToken(token) {
            titleParts = append(titleParts, token.Value)
        }
    }
    if len(titleParts) > 0 {
        p.result.MangaTitle = strings.Join(titleParts, " ")
    }
}

// addChapterNumber adds a chapter number, splitting embedded ranges
func (p *Parser) addChapterNumber(value string) {
    // Handle a range embedded in the value itself, e.g. "1-2"
    if strings.Contains(value, "-") {
        parts := strings.Split(value, "-")
        for _, part := range parts {
            if part != "" {
                p.result.Chapter = append(p.result.Chapter, strings.TrimSpace(part))
            }
        }
    } else {
        p.result.Chapter = append(p.result.Chapter, value)
    }
}

// isLikelyChapterNumber determines if a number token is likely a chapter
func (p *Parser) isLikelyChapterNumber(token Token, position int) bool {
    // If we already have chapters from keywords, be more strict
    if len(p.result.Chapter) > 0 {
        return false
    }
    // Check context - numbers among the first few tokens are likely chapters
    if position < 3 {
        return true
    }
    // Check if preceded by common separator patterns
    if position > 0 {
        prevToken := p.tokens[position-1]
        if prevToken.Type == TokenSeparator && (prevToken.Value == "-" || prevToken.Value == " ") {
            return true
        }
    }
    return false
}

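// E.g. in "001 - Some Title.cbz" the leading "001" (token position 0) is
// treated as a chapter even though no "ch"/"chapter" keyword is present.
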
// isNumberInTitle determines if a number token should be part of the title
func (p *Parser) isNumberInTitle(token Token, position int, chapterPos int) bool {
    // Don't include numbers that are right before the chapter position
    if position == chapterPos-1 {
        return false
    }
    // Check if this number looks like it's associated with a volume
    if position > 0 {
        prevToken := p.tokens[position-1]
        if prevToken.IsVolume {
            return false // This number belongs to the volume
        }
    }
    // Small numbers (like 05, 2) that appear early are likely part of the title
    if position < 5 {
        if val := token.Value; len(val) <= 2 {
            // Check if this number looks like part of a title (e.g., "Title 05")
            return true
        }
    }
    return false
}

// isIgnoredToken checks if token should be ignored in titles (case-insensitive)
func (p *Parser) isIgnoredToken(token Token) bool {
    ignoredWords := []string{"digital", "group", "scan", "scans", "team", "raw", "raws"}
    lowerValue := strings.ToLower(token.Value)
    for _, word := range ignoredWords {
        if lowerValue == word {
            return true
        }
    }
    // Ignore version indicators: a "v" followed immediately by a digit is
    // treated as a release version marker rather than title text
    if strings.HasPrefix(lowerValue, "v") && len(lowerValue) > 1 {
        remaining := lowerValue[1:]
        if remaining[0] >= '0' && remaining[0] <= '9' {
            return true
        }
    }
    return false
}

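// E.g. scanlation noise like "Digital" (now matched case-insensitively) and
// version-style tokens beginning with "v" plus a digit are kept out of titles.
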
// checkPDF sets the PDF flag if the file is a PDF
func (p *Parser) checkPDF() {
    for _, token := range p.tokens {
        if token.Type == TokenFileExtension && strings.Contains(token.Value, "pdf") {
            p.result.IsPDF = true
            break
        }
    }
}

// scanChapterFilename scans the filename and returns a chapter entry if it is a chapter.
// Note: ok is currently always true; callers should inspect the result fields.
func scanChapterFilename(filename string) (res *ScannedChapterFile, ok bool) {
    // Create lexer and tokenize
    lexer := NewLexer(filename)
    tokens := lexer.Tokenize()
    // Create parser and parse
    parser := NewParser(tokens)
    res = parser.Parse()
    return res, true
}

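// A minimal usage sketch (hypothetical filename):
//
//	res, _ := scanChapterFilename("Berserk ch 364 v41.cbz")
//	// res.Chapter -> ["364"], res.Volume -> ["41"], res.MangaTitle -> "Berserk"
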
// isFileImage reports whether the filename has a recognized image extension.
func isFileImage(filename string) bool {
    ext := strings.ToLower(filepath.Ext(filename))
    _, ok := ImageExtensions[ext]
    return ok
}

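// E.g. isFileImage("001.PNG") is true (the extension check is case-insensitive),
// while isFileImage("001.cbz") is false.
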
// extractChapterTitle finds the chapter title after the chapter number
func (p *Parser) extractChapterTitle(startPos int) {
    // Skip to after the chapter number
    numberPos := -1
    for i := startPos; i < len(p.tokens); i++ {
        if p.tokens[i].Type == TokenNumber {
            numberPos = i
            break
        }
    }
    if numberPos == -1 {
        return
    }
    // Look for a dash separator followed by text
    for i := numberPos + 1; i < len(p.tokens); i++ {
        token := p.tokens[i]
        if token.Type == TokenSeparator && token.Value == "-" {
            // Found a dash, collect text after it
            titleParts := make([]string, 0)
            for j := i + 1; j < len(p.tokens); j++ {
                nextToken := p.tokens[j]
                if nextToken.Type == TokenText && !p.isIgnoredToken(nextToken) {
                    titleParts = append(titleParts, nextToken.Value)
                } else if nextToken.Type == TokenFileExtension {
                    break
                }
            }
            if len(titleParts) > 0 {
                p.result.ChapterTitle = strings.Join(titleParts, " ")
            }
            break
        }
    }
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// ScannedPageFile holds the page number parsed from a page image filename.
type ScannedPageFile struct {
    Number   float64
    Filename string
    Ext      string
}

// parsePageFilename extracts the page number from a page image filename.
func parsePageFilename(filename string) (res *ScannedPageFile, ok bool) {
    res = &ScannedPageFile{
        Filename: filename,
    }
    filename = strings.ToLower(filename)
    res.Ext = filepath.Ext(filename)
    filename = strings.TrimSuffix(filename, res.Ext)
    if len(filename) == 0 {
        return res, false
    }
    // Fast path: the filename starts with a digit, e.g. "001"
    numStr := ""
    if unicode.IsDigit(rune(filename[0])) {
        // walk forward while we see digits or a decimal point
        for i := 0; i < len(filename); i++ {
            if !unicode.IsDigit(rune(filename[i])) && rune(filename[i]) != '.' {
                break
            }
            numStr += string(filename[i])
        }
        if len(numStr) > 0 {
            res.Number, _ = strconv.ParseFloat(numStr, 64)
            return res, true
        }
    }
    // Otherwise, find the first digit and read the number from there
    numStr = ""
    firstDigitIdx := strings.IndexFunc(filename, unicode.IsDigit)
    if firstDigitIdx != -1 {
        numStr += string(filename[firstDigitIdx])
        // walk until the first non-digit (allowing a decimal point) or the end
        for i := firstDigitIdx + 1; i < len(filename); i++ {
            if !unicode.IsDigit(rune(filename[i])) && rune(filename[i]) != '.' {
                break
            }
            numStr += string(filename[i])
        }
        if len(numStr) > 0 {
            res.Number, _ = strconv.ParseFloat(numStr, 64)
            return res, true
        }
    }
    return res, false
}
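
// E.g. parsePageFilename("001.png") would be expected to yield Number 1, and
// parsePageFilename("page_05.jpg") Number 5; a filename with no digits
// returns ok == false.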