/**
 * A cleaned piece of text together with a per-character lookup table that
 * points each cleaned character back at its source index in the original text.
 */
interface TextMapping {
	// The text after cleaning.
	cleanedText: string
	// positionMap[i] = index-in-original-text for the i-th character in cleanedText
	positionMap: number[]
}

/**
 * Returns the "cleaned" text and an array mapping each character position
 * in that cleaned text back to its position in the original text.
 *
 * Cleaning rules, applied left to right:
 * - A hyphen immediately followed by a whitespace character is dropped
 *   together with that whitespace, so a word hyphenated across a line break
 *   is joined again. A `\r\n` pair after the hyphen counts as a single line
 *   break and is dropped as a whole.
 * - Every other `\n` or `\r` becomes a single space, unless the previous
 *   cleaned character is already a space. The inserted space maps to the
 *   index of the line-break character it replaces.
 * - All remaining characters are copied unchanged.
 *
 * Finally the cleaned text is trimmed on both sides and the map is sliced
 * accordingly, so `positionMap.length === cleanedText.length` always holds.
 *
 * @param original - The raw text to clean.
 * @returns The trimmed cleaned text and its index map into `original`.
 */
export function cleanAndMapDocumentChunkText(original: string): TextMapping {
	// Hoisted so the regex literal is not re-evaluated on every iteration.
	const whitespace = /\s/

	const positionMap: number[] = []
	let cleanedText = ""

	let i = 0
	while (i < original.length) {
		const current = original.charAt(i)
		// charAt yields "" past the end of the string, which never matches \s.
		const next = original.charAt(i + 1)

		// Remove hyphen followed by whitespace
		if (current === "-" && whitespace.test(next)) {
			// A Windows line ending is one logical line break: skip both of its
			// characters, otherwise the leftover "\n" would be turned into a space
			// and the word would stay split ("foo-\r\nbar" -> "foo bar").
			const isCrLf = next === "\r" && original.charAt(i + 2) === "\n"
			i += isCrLf ? 3 : 2
			continue
		}

		// Replace newlines with spaces (if not redundant)
		if (current === "\n" || current === "\r") {
			// Only add a space if the last cleaned character wasn't a space
			if (cleanedText[cleanedText.length - 1] !== " ") {
				cleanedText += " "
				positionMap.push(i)
			}
			i++
			continue
		}

		// Default: copy the character and record the mapping
		cleanedText += current
		positionMap.push(i)
		i++
	}

	// Trim the final text and keep only the map entries of the surviving characters.
	const trimmedText = cleanedText.trim()
	const leadingTrimmed = cleanedText.length - cleanedText.trimStart().length

	return {
		cleanedText: trimmedText,
		positionMap: positionMap.slice(leadingTrimmed, leadingTrimmed + trimmedText.length),
	}
}
