// File: seanime-docker/seanime-2.9.10/internal/manga/providers/local_parser.go

package manga_providers

import (
    "path/filepath"
    "slices"
    "strconv"
    "strings"
    "unicode"
)

// ScannedChapterFile holds the result of parsing a chapter filename.
type ScannedChapterFile struct {
    Chapter      []string // can be a single chapter or a range of chapters
    MangaTitle   string   // typically comes before the chapter number
    ChapterTitle string   // typically comes after the chapter number
    Volume       []string // typically comes after the chapter number
    IsPDF        bool
}

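// Illustrative example (hypothetical filename): "One Piece - Chapter 1045 - Title.cbz"
// would be expected to parse to MangaTitle "One Piece", Chapter ["1045"],
// ChapterTitle "Title", IsPDF false.
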
// TokenType classifies the tokens produced by the lexer.
type TokenType int

const (
    TokenUnknown TokenType = iota
    TokenText
    TokenNumber
    TokenKeyword
    TokenSeparator
    TokenEnclosed
    TokenFileExtension
)

// Token represents a parsed token from the filename
type Token struct {
    Type      TokenType
    Value     string
    Position  int
    IsChapter bool
    IsVolume  bool
}

// Lexer handles the tokenization of the filename
type Lexer struct {
    input        string
    position     int
    tokens       []Token
    currentToken int
}

// ChapterKeywords are prefixes that mark a chapter number (matched case-insensitively).
var ChapterKeywords = []string{
    "ch", "chp", "chapter", "chap", "c",
}

// VolumeKeywords are prefixes that mark a volume number (matched case-insensitively).
var VolumeKeywords = []string{
    "v", "vol", "volume",
}

// SeparatorChars are characters treated as token separators.
var SeparatorChars = []rune{
    ' ', '-', '_', '.', '[', ']', '(', ')', '{', '}', '~',
}

// ImageExtensions is the set of file extensions recognized as page images.
var ImageExtensions = map[string]struct{}{
    ".png":  {},
    ".jpg":  {},
    ".jpeg": {},
    ".gif":  {},
    ".webp": {},
    ".bmp":  {},
    ".tiff": {},
    ".tif":  {},
}

// NewLexer creates a new lexer instance
func NewLexer(input string) *Lexer {
    return &Lexer{
        input:        strings.TrimSpace(input),
        tokens:       make([]Token, 0),
        currentToken: 0,
    }
}

// Tokenize breaks down the input into tokens
func (l *Lexer) Tokenize() []Token {
    l.position = 0
    l.tokens = make([]Token, 0)
    for l.position < len(l.input) {
        if l.isWhitespace(l.current()) {
            l.skipWhitespace()
            continue
        }
        if l.isEnclosedStart(l.current()) {
            l.readEnclosed()
            continue
        }
        if l.isSeparator(l.current()) {
            l.readSeparator()
            continue
        }
        if l.isDigit(l.current()) {
            l.readNumber()
            continue
        }
        if l.isLetter(l.current()) {
            l.readText()
            continue
        }
        // Skip unknown characters
        l.position++
    }
    l.classifyTokens()
    return l.tokens
}

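// For instance (hypothetical input), "One Piece ch012.cbz" would be expected
// to tokenize into roughly: Text("One"), Text("Piece"), Keyword("ch"),
// Number("012"), Separator("."), FileExtension("cbz").
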
// current returns the current character.
// Note: indexing is byte-based, so multi-byte UTF-8 runes are seen one byte
// at a time; such bytes match none of the ASCII predicates below and are
// skipped by Tokenize as unknown characters.
func (l *Lexer) current() rune {
    if l.position >= len(l.input) {
        return 0
    }
    return rune(l.input[l.position])
}

// peek returns the next character without advancing
func (l *Lexer) peek() rune {
    if l.position+1 >= len(l.input) {
        return 0
    }
    return rune(l.input[l.position+1])
}

// advance moves to the next character
func (l *Lexer) advance() {
    l.position++
}

// isWhitespace checks if character is whitespace
func (l *Lexer) isWhitespace(r rune) bool {
    return r == ' ' || r == '\t' || r == '\n' || r == '\r'
}

// isSeparator checks if character is a separator
func (l *Lexer) isSeparator(r rune) bool {
    for _, sep := range SeparatorChars {
        if r == sep {
            return true
        }
    }
    return false
}

// isEnclosedStart checks if character starts an enclosed section
func (l *Lexer) isEnclosedStart(r rune) bool {
    return r == '[' || r == '(' || r == '{'
}

// isDigit checks if character is an ASCII digit
func (l *Lexer) isDigit(r rune) bool {
    return r >= '0' && r <= '9'
}

// isLetter checks if character is an ASCII letter
func (l *Lexer) isLetter(r rune) bool {
    return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}

// skipWhitespace skips all whitespace characters
func (l *Lexer) skipWhitespace() {
    for l.position < len(l.input) && l.isWhitespace(l.current()) {
        l.advance()
    }
}

// readEnclosed reads content within brackets/parentheses.
// Note: if the bracket is never closed, its content is silently dropped.
func (l *Lexer) readEnclosed() {
    start := l.position
    openChar := l.current()
    var closeChar rune
    switch openChar {
    case '[':
        closeChar = ']'
    case '(':
        closeChar = ')'
    case '{':
        closeChar = '}'
    default:
        l.advance()
        return
    }
    l.advance() // Skip opening character
    startContent := l.position
    for l.position < len(l.input) && l.current() != closeChar {
        l.advance()
    }
    if l.position < len(l.input) {
        content := l.input[startContent:l.position]
        l.advance() // Skip closing character
        // Only add if content is meaningful
        if len(strings.TrimSpace(content)) > 0 {
            l.addToken(TokenEnclosed, content, start)
        }
    }
}

// readSeparator reads separator characters
func (l *Lexer) readSeparator() {
    start := l.position
    value := string(l.current())
    l.advance()
    l.addToken(TokenSeparator, value, start)
}

// readNumber reads numeric values (including decimals)
func (l *Lexer) readNumber() {
    start := l.position
    for l.position < len(l.input) && (l.isDigit(l.current()) || l.current() == '.') {
        // Stop if we hit a file extension
        if l.current() == '.' && l.position+1 < len(l.input) {
            // Check if the dot is followed by a known archive/document extension
            remaining := l.input[l.position+1:]
            if strings.HasPrefix(remaining, "cbz") || strings.HasPrefix(remaining, "cbr") ||
                strings.HasPrefix(remaining, "pdf") || strings.HasPrefix(remaining, "epub") {
                break
            }
        }
        l.advance()
    }
    value := l.input[start:l.position]
    l.addToken(TokenNumber, value, start)
}

// readText reads a run of alphanumeric text that begins with a letter
func (l *Lexer) readText() {
    start := l.position
    for l.position < len(l.input) && (l.isLetter(l.current()) || l.isDigit(l.current())) {
        l.advance()
    }
    value := l.input[start:l.position]
    lowerValue := strings.ToLower(value) // Use lowercase for keyword checking
    // Check if this might be a concatenated keyword that continues with a decimal
    if l.startsWithKeyword(lowerValue) && l.position < len(l.input) && l.current() == '.' {
        // Look ahead to see if there are more digits after the decimal point
        tempPos := l.position + 1
        if tempPos < len(l.input) && l.isDigit(rune(l.input[tempPos])) {
            // Read the decimal part
            l.advance() // consume the '.'
            for l.position < len(l.input) && l.isDigit(l.current()) {
                l.advance()
            }
            // Update value to include the decimal part
            value = l.input[start:l.position]
            lowerValue = strings.ToLower(value)
        }
    }
    // Check for concatenated keywords like "ch001", "c001", "chp001", "c12.5"
    if l.containsKeywordPrefix(lowerValue) {
        l.splitKeywordAndNumber(lowerValue, value, start) // Pass both versions
    } else {
        l.addToken(TokenText, value, start) // Use original case
    }
}

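// For example, a token like "Ch12.5" would be expected to be split into the
// keyword "Ch" (chapter) and the number "12.5", preserving the original case.
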
// startsWithKeyword checks if text starts with any known keyword
func (l *Lexer) startsWithKeyword(text string) bool {
    for _, keyword := range ChapterKeywords {
        if strings.HasPrefix(text, keyword) {
            return true
        }
    }
    for _, keyword := range VolumeKeywords {
        if strings.HasPrefix(text, keyword) {
            return true
        }
    }
    return false
}

// containsKeywordPrefix checks if text starts with a known keyword followed by a number
func (l *Lexer) containsKeywordPrefix(text string) bool {
    // Sort copies by length descending so longer keywords ("chapter") are
    // tried before their own prefixes ("chap", "ch", "c"); cloning avoids
    // mutating the shared package-level slices.
    chKeywords := slices.Clone(ChapterKeywords)
    slices.SortFunc(chKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range chKeywords {
        if strings.HasPrefix(text, keyword) && len(text) > len(keyword) {
            // Check if the remaining part is numeric (including decimals)
            return l.isValidNumberPart(text[len(keyword):])
        }
    }
    volKeywords := slices.Clone(VolumeKeywords)
    slices.SortFunc(volKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range volKeywords {
        if strings.HasPrefix(text, keyword) && len(text) > len(keyword) {
            return l.isValidNumberPart(text[len(keyword):])
        }
    }
    return false
}

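// E.g. "chapter12" matches ("chapter" + "12"), whereas a shortest-first match
// on "ch" would leave the non-numeric remainder "apter12" and fail.
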
// isValidNumberPart checks if string is a valid number (including decimals)
func (l *Lexer) isValidNumberPart(s string) bool {
    if len(s) == 0 {
        return false
    }
    // Don't allow starting with a decimal point
    if s[0] == '.' {
        return false
    }
    hasDecimal := false
    for _, r := range s {
        if r == '.' {
            if hasDecimal {
                return false // Multiple decimals not allowed
            }
            hasDecimal = true
        } else if !l.isDigit(r) {
            return false
        }
    }
    return true
}

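// E.g. "12" and "12.5" are valid number parts; ".5", "1.2.3", and "12a" are not.
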
// splitKeywordAndNumber splits concatenated keyword and number tokens
func (l *Lexer) splitKeywordAndNumber(lowerText, originalText string, position int) {
    // Match longest keywords first (see containsKeywordPrefix) so that e.g.
    // "chapter12" is split as "chapter"+"12" rather than "ch"+"apter12".
    chKeywords := slices.Clone(ChapterKeywords)
    slices.SortFunc(chKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range chKeywords {
        if strings.HasPrefix(lowerText, keyword) && len(lowerText) > len(keyword) {
            // Use original case for the keyword part
            originalKeyword := originalText[:len(keyword)]
            l.addKeywordToken(originalKeyword, position, true, false)
            // Extract the number part (keeping original formatting)
            numberPart := originalText[len(keyword):]
            l.addToken(TokenNumber, numberPart, position+len(keyword))
            return
        }
    }
    volKeywords := slices.Clone(VolumeKeywords)
    slices.SortFunc(volKeywords, func(a, b string) int {
        return len(b) - len(a)
    })
    for _, keyword := range volKeywords {
        if strings.HasPrefix(lowerText, keyword) && len(lowerText) > len(keyword) {
            originalKeyword := originalText[:len(keyword)]
            l.addKeywordToken(originalKeyword, position, false, true)
            numberPart := originalText[len(keyword):]
            l.addToken(TokenNumber, numberPart, position+len(keyword))
            return
        }
    }
}

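// For example, "V02" would be expected to yield the keyword token "V"
// (volume) followed by the number token "02".
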
// addKeywordToken adds a keyword token with chapter/volume flags
func (l *Lexer) addKeywordToken(value string, position int, isChapter, isVolume bool) {
    l.tokens = append(l.tokens, Token{
        Type:      TokenKeyword,
        Value:     value,
        Position:  position,
        IsChapter: isChapter,
        IsVolume:  isVolume,
    })
}

// addToken adds a token to the list
func (l *Lexer) addToken(tokenType TokenType, value string, position int) {
    l.tokens = append(l.tokens, Token{
        Type:     tokenType,
        Value:    value,
        Position: position,
    })
}

// classifyTokens identifies chapter/volume keywords and file extensions
func (l *Lexer) classifyTokens() {
    for i := range l.tokens {
        token := &l.tokens[i]
        // Check for chapter keywords (case-insensitive)
        lowerValue := strings.ToLower(token.Value)
        for _, keyword := range ChapterKeywords {
            if lowerValue == keyword {
                token.Type = TokenKeyword
                token.IsChapter = true
                break
            }
        }
        // Check for volume keywords (case-insensitive)
        for _, keyword := range VolumeKeywords {
            if lowerValue == keyword {
                token.Type = TokenKeyword
                token.IsVolume = true
                break
            }
        }
        // Check for file extensions (substring match, so "cbz" inside a
        // larger token also counts)
        if strings.Contains(lowerValue, "pdf") || strings.Contains(lowerValue, "cbz") ||
            strings.Contains(lowerValue, "cbr") || strings.Contains(lowerValue, "epub") {
            token.Type = TokenFileExtension
        }
    }
}

// Parser handles the semantic analysis of tokens
type Parser struct {
    tokens []Token
    result *ScannedChapterFile
}

// NewParser creates a new parser instance
func NewParser(tokens []Token) *Parser {
    return &Parser{
        tokens: tokens,
        result: &ScannedChapterFile{
            Chapter: make([]string, 0),
            Volume:  make([]string, 0),
        },
    }
}

// Parse performs semantic analysis on the tokens
func (p *Parser) Parse() *ScannedChapterFile {
    p.extractChapters()
    p.extractVolumes()
    p.extractTitles()
    p.checkPDF()
    return p.result
}

// extractChapters finds and extracts chapter numbers
func (p *Parser) extractChapters() {
    for i, token := range p.tokens {
        if token.IsChapter {
            // Look for a number within the next two tokens after the keyword
            for j := i + 1; j < len(p.tokens) && j < i+3; j++ {
                nextToken := p.tokens[j]
                if nextToken.Type == TokenNumber {
                    p.addChapterNumber(nextToken.Value)
                    break
                } else if nextToken.Type == TokenSeparator {
                    continue
                } else {
                    break
                }
            }
        } else if token.Type == TokenNumber && !token.IsVolume {
            // A standalone number might be a chapter
            if p.isLikelyChapterNumber(token, i) {
                p.addChapterNumber(token.Value)
            }
        }
    }
    // Handle ranges by looking for dash-separated numbers
    p.handleChapterRanges()
}

// handleChapterRanges processes chapter ranges like "1-2" or "001-002"
func (p *Parser) handleChapterRanges() {
    for i := 0; i < len(p.tokens)-2; i++ {
        if p.tokens[i].Type == TokenNumber &&
            p.tokens[i+1].Type == TokenSeparator && p.tokens[i+1].Value == "-" &&
            p.tokens[i+2].Type == TokenNumber {
            // Check if the first number was already recorded as a chapter
            firstIsChapter := false
            for _, ch := range p.result.Chapter {
                if ch == p.tokens[i].Value {
                    firstIsChapter = true
                    break
                }
            }
            if firstIsChapter {
                // Add the second number of the range as a chapter too
                p.result.Chapter = append(p.result.Chapter, p.tokens[i+2].Value)
            }
        }
    }
}

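// E.g. for "ch 001-002", once "001" is recorded as a chapter the trailing
// "002" is appended as well, giving Chapter ["001", "002"].
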
// extractVolumes finds and extracts volume numbers
func (p *Parser) extractVolumes() {
    for i, token := range p.tokens {
        if token.IsVolume {
            // Look for a number within the next two tokens after the keyword
            for j := i + 1; j < len(p.tokens) && j < i+3; j++ {
                nextToken := p.tokens[j]
                if nextToken.Type == TokenNumber {
                    p.result.Volume = append(p.result.Volume, nextToken.Value)
                    break
                } else if nextToken.Type == TokenSeparator {
                    continue
                } else {
                    break
                }
            }
        }
    }
}

// extractTitles finds the manga title and chapter title
func (p *Parser) extractTitles() {
    // Find the first chapter keyword or chapter-like number position
    chapterPos := -1
    for i, token := range p.tokens {
        if token.IsChapter || (token.Type == TokenNumber && p.isLikelyChapterNumber(token, i)) {
            chapterPos = i
            break
        }
    }
    if chapterPos > 0 {
        // Everything before the chapter marker is likely the manga title
        titleParts := make([]string, 0)
        for i := 0; i < chapterPos; i++ {
            token := p.tokens[i]
            if token.Type == TokenText && !token.IsVolume && !p.isIgnoredToken(token) {
                titleParts = append(titleParts, token.Value)
            } else if token.Type == TokenNumber && p.isNumberInTitle(token, i, chapterPos) {
                // Include numbers that are part of the title (but not volume indicators)
                titleParts = append(titleParts, token.Value)
            }
        }
        if len(titleParts) > 0 {
            p.result.MangaTitle = strings.Join(titleParts, " ")
        }
        // Look for a chapter title after the chapter number
        p.extractChapterTitle(chapterPos)
    } else {
        // No clear chapter indicator; check if this is a "number - title" pattern
        if len(p.result.Chapter) > 0 && p.hasChapterTitlePattern() {
            p.extractChapterTitleFromPattern()
        } else {
            // Treat most text as the manga title
            p.extractFallbackTitle()
        }
    }
}

// hasChapterTitlePattern checks for a "number - title" pattern
func (p *Parser) hasChapterTitlePattern() bool {
    for i := 0; i < len(p.tokens)-2; i++ {
        if p.tokens[i].Type == TokenNumber &&
            p.tokens[i+1].Type == TokenSeparator && p.tokens[i+1].Value == "-" &&
            p.tokens[i+2].Type == TokenText {
            return true
        }
    }
    return false
}

// extractChapterTitleFromPattern extracts the title from a "number - title" pattern
func (p *Parser) extractChapterTitleFromPattern() {
    for i := 0; i < len(p.tokens)-2; i++ {
        if p.tokens[i].Type == TokenNumber &&
            p.tokens[i+1].Type == TokenSeparator && p.tokens[i+1].Value == "-" {
            // Collect text after the dash
            titleParts := make([]string, 0)
            for j := i + 2; j < len(p.tokens); j++ {
                token := p.tokens[j]
                if token.Type == TokenText && !p.isIgnoredToken(token) {
                    titleParts = append(titleParts, token.Value)
                } else if token.Type == TokenFileExtension {
                    break
                }
            }
            if len(titleParts) > 0 {
                p.result.ChapterTitle = strings.Join(titleParts, " ")
            }
            break
        }
    }
}

// extractFallbackTitle extracts the title when there are no clear chapter indicators
func (p *Parser) extractFallbackTitle() {
    titleParts := make([]string, 0)
    for _, token := range p.tokens {
        if token.Type == TokenText && !p.isIgnoredToken(token) {
            titleParts = append(titleParts, token.Value)
        }
    }
    if len(titleParts) > 0 {
        p.result.MangaTitle = strings.Join(titleParts, " ")
    }
}

// addChapterNumber adds a chapter number, splitting embedded ranges
func (p *Parser) addChapterNumber(value string) {
    // Handle a range embedded in the value itself, e.g. "1-2"
    if strings.Contains(value, "-") {
        parts := strings.Split(value, "-")
        for _, part := range parts {
            if part != "" {
                p.result.Chapter = append(p.result.Chapter, strings.TrimSpace(part))
            }
        }
    } else {
        p.result.Chapter = append(p.result.Chapter, value)
    }
}

// isLikelyChapterNumber determines if a number token is likely a chapter
func (p *Parser) isLikelyChapterNumber(token Token, position int) bool {
    // If we already have chapters from keywords, be more strict
    if len(p.result.Chapter) > 0 {
        return false
    }
    // Check context - numbers among the first few tokens are likely chapters
    if position < 3 {
        return true
    }
    // Check if preceded by common separator patterns
    if position > 0 {
        prevToken := p.tokens[position-1]
        if prevToken.Type == TokenSeparator && (prevToken.Value == "-" || prevToken.Value == " ") {
            return true
        }
    }
    return false
}

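// E.g. in "001 - Some Title.cbz" the leading "001" (token position 0) is
// treated as a chapter even though no "ch"/"chapter" keyword is present.
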
// isNumberInTitle determines if a number token should be part of the title
func (p *Parser) isNumberInTitle(token Token, position int, chapterPos int) bool {
    // Don't include numbers that are right before the chapter position
    if position == chapterPos-1 {
        return false
    }
    // Check if this number looks like it's associated with a volume
    if position > 0 {
        prevToken := p.tokens[position-1]
        if prevToken.IsVolume {
            return false // This number belongs to the volume
        }
    }
    // Small numbers (like 05, 2) that appear early are likely part of the title
    if position < 5 {
        if val := token.Value; len(val) <= 2 {
            // Check if this number looks like part of a title (e.g., "Title 05")
            return true
        }
    }
    return false
}

// isIgnoredToken checks if token should be ignored in titles (case-insensitive)
func (p *Parser) isIgnoredToken(token Token) bool {
    ignoredWords := []string{"digital", "group", "scan", "scans", "team", "raw", "raws"}
    lowerValue := strings.ToLower(token.Value)
    for _, word := range ignoredWords {
        if lowerValue == word {
            return true
        }
    }
    // Ignore version indicators: a "v" followed immediately by a digit is
    // treated as a release version marker rather than title text
    if strings.HasPrefix(lowerValue, "v") && len(lowerValue) > 1 {
        remaining := lowerValue[1:]
        if remaining[0] >= '0' && remaining[0] <= '9' {
            return true
        }
    }
    return false
}

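// E.g. scanlation noise like "Digital" (now matched case-insensitively) and
// version-style tokens beginning with "v" plus a digit are kept out of titles.
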
// checkPDF sets the PDF flag if the file is a PDF
func (p *Parser) checkPDF() {
    for _, token := range p.tokens {
        if token.Type == TokenFileExtension && strings.Contains(token.Value, "pdf") {
            p.result.IsPDF = true
            break
        }
    }
}

// scanChapterFilename scans the filename and returns a chapter entry if it is a chapter.
// Note: ok is currently always true; callers should inspect the result fields.
func scanChapterFilename(filename string) (res *ScannedChapterFile, ok bool) {
    // Create lexer and tokenize
    lexer := NewLexer(filename)
    tokens := lexer.Tokenize()
    // Create parser and parse
    parser := NewParser(tokens)
    res = parser.Parse()
    return res, true
}

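// A minimal usage sketch (hypothetical filename):
//
//	res, _ := scanChapterFilename("Berserk ch 364 v41.cbz")
//	// res.Chapter -> ["364"], res.Volume -> ["41"], res.MangaTitle -> "Berserk"
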
// isFileImage reports whether the filename has a recognized image extension.
func isFileImage(filename string) bool {
    ext := strings.ToLower(filepath.Ext(filename))
    _, ok := ImageExtensions[ext]
    return ok
}

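// E.g. isFileImage("001.PNG") is true (the extension check is case-insensitive),
// while isFileImage("001.cbz") is false.
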
// extractChapterTitle finds the chapter title after the chapter number
func (p *Parser) extractChapterTitle(startPos int) {
    // Skip to after the chapter number
    numberPos := -1
    for i := startPos; i < len(p.tokens); i++ {
        if p.tokens[i].Type == TokenNumber {
            numberPos = i
            break
        }
    }
    if numberPos == -1 {
        return
    }
    // Look for a dash separator followed by text
    for i := numberPos + 1; i < len(p.tokens); i++ {
        token := p.tokens[i]
        if token.Type == TokenSeparator && token.Value == "-" {
            // Found a dash, collect text after it
            titleParts := make([]string, 0)
            for j := i + 1; j < len(p.tokens); j++ {
                nextToken := p.tokens[j]
                if nextToken.Type == TokenText && !p.isIgnoredToken(nextToken) {
                    titleParts = append(titleParts, nextToken.Value)
                } else if nextToken.Type == TokenFileExtension {
                    break
                }
            }
            if len(titleParts) > 0 {
                p.result.ChapterTitle = strings.Join(titleParts, " ")
            }
            break
        }
    }
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// ScannedPageFile holds the page number parsed from a page image filename.
type ScannedPageFile struct {
    Number   float64
    Filename string
    Ext      string
}

// parsePageFilename extracts the page number from a page image filename.
func parsePageFilename(filename string) (res *ScannedPageFile, ok bool) {
    res = &ScannedPageFile{
        Filename: filename,
    }
    filename = strings.ToLower(filename)
    res.Ext = filepath.Ext(filename)
    filename = strings.TrimSuffix(filename, res.Ext)
    if len(filename) == 0 {
        return res, false
    }
    // Fast path: the filename starts with a digit, e.g. "001"
    numStr := ""
    if unicode.IsDigit(rune(filename[0])) {
        // walk forward while we see digits or a decimal point
        for i := 0; i < len(filename); i++ {
            if !unicode.IsDigit(rune(filename[i])) && rune(filename[i]) != '.' {
                break
            }
            numStr += string(filename[i])
        }
        if len(numStr) > 0 {
            res.Number, _ = strconv.ParseFloat(numStr, 64)
            return res, true
        }
    }
    // Otherwise, find the first digit and read the number from there
    numStr = ""
    firstDigitIdx := strings.IndexFunc(filename, unicode.IsDigit)
    if firstDigitIdx != -1 {
        numStr += string(filename[firstDigitIdx])
        // walk until the first non-digit (allowing a decimal point) or the end
        for i := firstDigitIdx + 1; i < len(filename); i++ {
            if !unicode.IsDigit(rune(filename[i])) && rune(filename[i]) != '.' {
                break
            }
            numStr += string(filename[i])
        }
        if len(numStr) > 0 {
            res.Number, _ = strconv.ParseFloat(numStr, 64)
            return res, true
        }
    }
    return res, false
}
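
// E.g. parsePageFilename("001.png") would be expected to yield Number 1, and
// parsePageFilename("page_05.jpg") Number 5; a filename with no digits
// returns ok == false.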