pull: Initial commit
@@ -0,0 +1,87 @@
import type { CharacterTextSplitterParams } from '@langchain/textsplitters';
import { CharacterTextSplitter } from '@langchain/textsplitters';
import {
	NodeConnectionTypes,
	type INodeType,
	type INodeTypeDescription,
	type ISupplyDataFunctions,
	type SupplyData,
} from 'n8n-workflow';

import { logWrapper } from '@utils/logWrapper';
import { getConnectionHintNoticeField } from '@utils/sharedFields';

export class TextSplitterCharacterTextSplitter implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'Character Text Splitter',
		name: 'textSplitterCharacterTextSplitter',
		icon: 'fa:grip-lines-vertical',
		iconColor: 'black',
		group: ['transform'],
		version: 1,
		description: 'Split text into chunks by characters',
		defaults: {
			name: 'Character Text Splitter',
		},
		codex: {
			categories: ['AI'],
			subcategories: {
				AI: ['Text Splitters'],
			},
			resources: {
				primaryDocumentation: [
					{
						url: 'https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.textsplittercharactertextsplitter/',
					},
				],
			},
		},

		inputs: [],

		outputs: [NodeConnectionTypes.AiTextSplitter],
		outputNames: ['Text Splitter'],
		properties: [
			getConnectionHintNoticeField([NodeConnectionTypes.AiDocument]),
			{
				displayName: 'Separator',
				name: 'separator',
				type: 'string',
				default: '',
			},
			{
				displayName: 'Chunk Size',
				name: 'chunkSize',
				type: 'number',
				default: 1000,
			},
			{
				displayName: 'Chunk Overlap',
				name: 'chunkOverlap',
				type: 'number',
				default: 0,
			},
		],
	};

	async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
		this.logger.debug('Supply Data for Text Splitter');

		const separator = this.getNodeParameter('separator', itemIndex) as string;
		const chunkSize = this.getNodeParameter('chunkSize', itemIndex) as number;
		const chunkOverlap = this.getNodeParameter('chunkOverlap', itemIndex) as number;

		const params: CharacterTextSplitterParams = {
			separator,
			chunkSize,
			chunkOverlap,
			keepSeparator: false,
		};

		const splitter = new CharacterTextSplitter(params);

		return {
			response: logWrapper(splitter, this),
		};
	}
}
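For context, here is a minimal standalone sketch (not part of this commit) of how the CharacterTextSplitter configured by the node above behaves when called directly; the separator value and the sample text are illustrative:

import { CharacterTextSplitter } from '@langchain/textsplitters';

// Mirrors the parameters the node reads from its UI fields.
async function demoCharacterSplit(): Promise<void> {
	const splitter = new CharacterTextSplitter({
		separator: '\n',
		chunkSize: 1000,
		chunkOverlap: 0,
		keepSeparator: false,
	});
	// splitText() is async and returns the chunk strings.
	const chunks = await splitter.splitText('First paragraph.\nSecond paragraph.');
	console.log(chunks.length);
}

void demoCharacterSplit();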
@@ -0,0 +1,127 @@
import type {
	RecursiveCharacterTextSplitterParams,
	SupportedTextSplitterLanguage,
} from '@langchain/textsplitters';
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
import {
	NodeConnectionTypes,
	type INodeType,
	type INodeTypeDescription,
	type ISupplyDataFunctions,
	type SupplyData,
} from 'n8n-workflow';

import { logWrapper } from '@utils/logWrapper';
import { getConnectionHintNoticeField } from '@utils/sharedFields';

const supportedLanguages: SupportedTextSplitterLanguage[] = [
	'cpp',
	'go',
	'java',
	'js',
	'php',
	'proto',
	'python',
	'rst',
	'ruby',
	'rust',
	'scala',
	'swift',
	'markdown',
	'latex',
	'html',
];

export class TextSplitterRecursiveCharacterTextSplitter implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'Recursive Character Text Splitter',
		name: 'textSplitterRecursiveCharacterTextSplitter',
		icon: 'fa:grip-lines-vertical',
		iconColor: 'black',
		group: ['transform'],
		version: 1,
		description: 'Split text into chunks by characters recursively, recommended for most use cases',
		defaults: {
			name: 'Recursive Character Text Splitter',
		},
		codex: {
			categories: ['AI'],
			subcategories: {
				AI: ['Text Splitters'],
			},
			resources: {
				primaryDocumentation: [
					{
						url: 'https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.textsplitterrecursivecharactertextsplitter/',
					},
				],
			},
		},

		inputs: [],

		outputs: [NodeConnectionTypes.AiTextSplitter],
		outputNames: ['Text Splitter'],
		properties: [
			getConnectionHintNoticeField([NodeConnectionTypes.AiDocument]),
			{
				displayName: 'Chunk Size',
				name: 'chunkSize',
				type: 'number',
				default: 1000,
			},
			{
				displayName: 'Chunk Overlap',
				name: 'chunkOverlap',
				type: 'number',
				default: 0,
			},
			{
				displayName: 'Options',
				name: 'options',
				placeholder: 'Add Option',
				description: 'Additional options to add',
				type: 'collection',
				default: {},
				options: [
					{
						displayName: 'Split Code',
						name: 'splitCode',
						default: 'markdown',
						type: 'options',
						options: supportedLanguages.map((lang) => ({ name: lang, value: lang })),
					},
				],
			},
		],
	};

	async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
		this.logger.debug('Supply Data for Text Splitter');

		const chunkSize = this.getNodeParameter('chunkSize', itemIndex) as number;
		const chunkOverlap = this.getNodeParameter('chunkOverlap', itemIndex) as number;
		const splitCode = this.getNodeParameter(
			'options.splitCode',
			itemIndex,
			null,
		) as SupportedTextSplitterLanguage | null;
		const params: RecursiveCharacterTextSplitterParams = {
			// TODO: These are the default values, should we allow the user to change them?
			separators: ['\n\n', '\n', ' ', ''],
			chunkSize,
			chunkOverlap,
			keepSeparator: false,
		};
		let splitter: RecursiveCharacterTextSplitter;

		if (splitCode && supportedLanguages.includes(splitCode)) {
			splitter = RecursiveCharacterTextSplitter.fromLanguage(splitCode, params);
		} else {
			splitter = new RecursiveCharacterTextSplitter(params);
		}

		return {
			response: logWrapper(splitter, this),
		};
	}
}
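As a point of reference, a minimal sketch (not part of this commit) of what the Split Code option does: RecursiveCharacterTextSplitter.fromLanguage builds a splitter with language-specific separators, so code and markup are split on syntactic boundaries rather than only on whitespace. The chosen language and the sample input below are illustrative:

import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';

async function demoCodeSplit(): Promise<void> {
	// Roughly what the node does when splitCode is set to 'markdown'.
	const splitter = RecursiveCharacterTextSplitter.fromLanguage('markdown', {
		chunkSize: 1000,
		chunkOverlap: 0,
	});
	const chunks = await splitter.splitText('# Title\n\nIntro paragraph.\n\n## Section\n\nMore text.');
	console.log(chunks);
}

void demoCodeSplit();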
@@ -0,0 +1,80 @@
import {
	NodeConnectionTypes,
	type INodeType,
	type INodeTypeDescription,
	type ISupplyDataFunctions,
	type SupplyData,
} from 'n8n-workflow';

import { logWrapper } from '@utils/logWrapper';
import { getConnectionHintNoticeField } from '@utils/sharedFields';

import { TokenTextSplitter } from './TokenTextSplitter';

export class TextSplitterTokenSplitter implements INodeType {
	description: INodeTypeDescription = {
		displayName: 'Token Splitter',
		name: 'textSplitterTokenSplitter',
		icon: 'fa:grip-lines-vertical',
		iconColor: 'black',
		group: ['transform'],
		version: 1,
		description: 'Split text into chunks by tokens',
		defaults: {
			name: 'Token Splitter',
		},
		codex: {
			categories: ['AI'],
			subcategories: {
				AI: ['Text Splitters'],
			},
			resources: {
				primaryDocumentation: [
					{
						url: 'https://docs.n8n.io/integrations/builtin/cluster-nodes/sub-nodes/n8n-nodes-langchain.textsplittertokensplitter/',
					},
				],
			},
		},

		inputs: [],

		outputs: [NodeConnectionTypes.AiTextSplitter],
		outputNames: ['Text Splitter'],
		properties: [
			getConnectionHintNoticeField([NodeConnectionTypes.AiDocument]),
			{
				displayName: 'Chunk Size',
				name: 'chunkSize',
				type: 'number',
				default: 1000,
			},
			{
				displayName: 'Chunk Overlap',
				name: 'chunkOverlap',
				type: 'number',
				default: 0,
			},
		],
	};

	async supplyData(this: ISupplyDataFunctions, itemIndex: number): Promise<SupplyData> {
		this.logger.debug('Supply Data for Text Splitter');

		const chunkSize = this.getNodeParameter('chunkSize', itemIndex) as number;
		const chunkOverlap = this.getNodeParameter('chunkOverlap', itemIndex) as number;

		const splitter = new TokenTextSplitter({
			chunkSize,
			chunkOverlap,
			allowedSpecial: 'all',
			disallowedSpecial: 'all',
			encodingName: 'cl100k_base',
			keepSeparator: false,
		});

		return {
			response: logWrapper(splitter, this),
		};
	}
}
@@ -0,0 +1,90 @@
import type { TokenTextSplitterParams } from '@langchain/textsplitters';
import { TextSplitter } from '@langchain/textsplitters';
import { hasLongSequentialRepeat } from '@utils/helpers';
import { getEncoding } from '@utils/tokenizer/tiktoken';
import { estimateTextSplitsByTokens } from '@utils/tokenizer/token-estimator';
import type * as tiktoken from 'js-tiktoken';

/**
 * Implementation of a splitter which looks at tokens.
 * This is an override of the LangChain TokenTextSplitter
 * that uses the n8n tokenizer utility, which relies on local JSON encodings.
 */
export class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
	static lc_name() {
		return 'TokenTextSplitter';
	}

	encodingName: tiktoken.TiktokenEncoding;

	allowedSpecial: 'all' | string[];

	disallowedSpecial: 'all' | string[];

	private tokenizer: tiktoken.Tiktoken | undefined;

	constructor(fields?: Partial<TokenTextSplitterParams>) {
		super(fields);

		this.encodingName = fields?.encodingName ?? 'cl100k_base';
		this.allowedSpecial = fields?.allowedSpecial ?? [];
		this.disallowedSpecial = fields?.disallowedSpecial ?? 'all';
	}

	async splitText(text: string): Promise<string[]> {
		try {
			// Validate input
			if (!text || typeof text !== 'string') {
				return [];
			}

			// Check for repetitive content
			if (hasLongSequentialRepeat(text)) {
				const splits = estimateTextSplitsByTokens(
					text,
					this.chunkSize,
					this.chunkOverlap,
					this.encodingName,
				);
				return splits;
			}

			// Use tiktoken for normal text
			try {
				this.tokenizer ??= getEncoding(this.encodingName);

				const splits: string[] = [];
				const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);

				let start_idx = 0;
				let chunkCount = 0;

				while (start_idx < input_ids.length) {
					if (start_idx > 0) {
						start_idx = Math.max(0, start_idx - this.chunkOverlap);
					}
					const end_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
					const chunk_ids = input_ids.slice(start_idx, end_idx);

					splits.push(this.tokenizer.decode(chunk_ids));

					chunkCount++;
					start_idx = end_idx;
				}

				return splits;
			} catch (tiktokenError) {
				// Fall back to character-based splitting if tiktoken fails
				return estimateTextSplitsByTokens(
					text,
					this.chunkSize,
					this.chunkOverlap,
					this.encodingName,
				);
			}
		} catch (error) {
			// Return empty array on complete failure
			return [];
		}
	}
}
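To make the windowing arithmetic in splitText() above easier to follow, here is a small self-contained sketch (not part of this commit) that runs the same chunk/overlap loop on plain numbers instead of real token ids; the function name chunkIds, the sample ids, and the chunkSize/chunkOverlap values are purely illustrative:

// Replays the while-loop from splitText(): each chunk after the first starts
// chunkOverlap positions before the previous chunk ended, so adjacent chunks
// share that many trailing/leading ids.
function chunkIds(ids: number[], chunkSize: number, chunkOverlap: number): number[][] {
	const chunks: number[][] = [];
	let start = 0;
	while (start < ids.length) {
		if (start > 0) {
			start = Math.max(0, start - chunkOverlap);
		}
		const end = Math.min(start + chunkSize, ids.length);
		chunks.push(ids.slice(start, end));
		start = end;
	}
	return chunks;
}

// chunkIds([10, 11, 12, 13, 14, 15, 16, 17], 3, 1)
// => [[10, 11, 12], [12, 13, 14], [14, 15, 16], [16, 17]]
console.log(chunkIds([10, 11, 12, 13, 14, 15, 16, 17], 3, 1));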
@@ -0,0 +1,345 @@
import { OperationalError } from 'n8n-workflow';

import * as helpers from '../../../../utils/helpers';
import * as tiktokenUtils from '../../../../utils/tokenizer/tiktoken';
import * as tokenEstimator from '../../../../utils/tokenizer/token-estimator';
import { TokenTextSplitter } from '../TokenTextSplitter';

jest.mock('../../../../utils/tokenizer/tiktoken');
jest.mock('../../../../utils/helpers');
jest.mock('../../../../utils/tokenizer/token-estimator');

describe('TokenTextSplitter', () => {
	let mockTokenizer: jest.Mocked<{
		encode: jest.Mock;
		decode: jest.Mock;
	}>;

	beforeEach(() => {
		mockTokenizer = {
			encode: jest.fn(),
			decode: jest.fn(),
		};
		(tiktokenUtils.getEncoding as jest.Mock).mockReturnValue(mockTokenizer);
		// Default mock for hasLongSequentialRepeat - no repetition
		(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
	});

	afterEach(() => {
		jest.clearAllMocks();
	});

	describe('constructor', () => {
		it('should initialize with default parameters', () => {
			const splitter = new TokenTextSplitter();

			expect(splitter.encodingName).toBe('cl100k_base');
			expect(splitter.allowedSpecial).toEqual([]);
			expect(splitter.disallowedSpecial).toBe('all');
		});

		it('should initialize with custom parameters', () => {
			const splitter = new TokenTextSplitter({
				encodingName: 'o200k_base',
				allowedSpecial: ['<|special|>'],
				disallowedSpecial: ['<|bad|>'],
				chunkSize: 500,
				chunkOverlap: 50,
			});

			expect(splitter.encodingName).toBe('o200k_base');
			expect(splitter.allowedSpecial).toEqual(['<|special|>']);
			expect(splitter.disallowedSpecial).toEqual(['<|bad|>']);
			expect(splitter.chunkSize).toBe(500);
			expect(splitter.chunkOverlap).toBe(50);
		});

		it('should have correct lc_name', () => {
			expect(TokenTextSplitter.lc_name()).toBe('TokenTextSplitter');
		});
	});

	describe('splitText', () => {
		it('should split text into chunks based on token count', async () => {
			const splitter = new TokenTextSplitter({
				chunkSize: 3,
				chunkOverlap: 0,
			});

			const inputText = 'Hello world, this is a test';
			const mockTokenIds = [1, 2, 3, 4, 5, 6, 7, 8];

			mockTokenizer.encode.mockReturnValue(mockTokenIds);
			mockTokenizer.decode.mockImplementation((tokens: number[]) => {
				const chunks = [
					[1, 2, 3],
					[4, 5, 6],
					[7, 8],
				];
				const chunkTexts = ['Hello world,', ' this is', ' a test'];
				const index = chunks.findIndex(
					(chunk) => chunk.length === tokens.length && chunk.every((val, i) => val === tokens[i]),
				);
				return chunkTexts[index] || '';
			});

			const result = await splitter.splitText(inputText);

			expect(tiktokenUtils.getEncoding).toHaveBeenCalledWith('cl100k_base');
			expect(mockTokenizer.encode).toHaveBeenCalledWith(inputText, [], 'all');
			expect(result).toEqual(['Hello world,', ' this is', ' a test']);
		});

		it('should handle empty text', async () => {
			const splitter = new TokenTextSplitter();
			mockTokenizer.encode.mockReturnValue([]);

			const result = await splitter.splitText('');

			expect(result).toEqual([]);
		});

		it('should handle text shorter than chunk size', async () => {
			const splitter = new TokenTextSplitter({
				chunkSize: 10,
				chunkOverlap: 0,
			});

			const inputText = 'Short text';
			const mockTokenIds = [1, 2];

			mockTokenizer.encode.mockReturnValue(mockTokenIds);
			mockTokenizer.decode.mockReturnValue('Short text');

			const result = await splitter.splitText(inputText);

			expect(result).toEqual(['Short text']);
		});

		it('should use custom encoding and special tokens', async () => {
			const splitter = new TokenTextSplitter({
				encodingName: 'o200k_base',
				allowedSpecial: ['<|special|>'],
				disallowedSpecial: ['<|bad|>'],
			});

			const inputText = 'Text with <|special|> tokens';
			mockTokenizer.encode.mockReturnValue([1, 2, 3]);
			mockTokenizer.decode.mockReturnValue('Text with <|special|> tokens');

			await splitter.splitText(inputText);

			expect(tiktokenUtils.getEncoding).toHaveBeenCalledWith('o200k_base');
			expect(mockTokenizer.encode).toHaveBeenCalledWith(inputText, ['<|special|>'], ['<|bad|>']);
		});

		it('should reuse tokenizer on subsequent calls', async () => {
			const splitter = new TokenTextSplitter();
			mockTokenizer.encode.mockReturnValue([1, 2, 3]);
			mockTokenizer.decode.mockReturnValue('test');

			await splitter.splitText('first call');
			await splitter.splitText('second call');

			expect(tiktokenUtils.getEncoding).toHaveBeenCalledTimes(1);
		});

		it('should handle large text with multiple chunks and overlap', async () => {
			const splitter = new TokenTextSplitter({
				chunkSize: 2,
				chunkOverlap: 1,
			});

			const inputText = 'One two three four five six';
			const mockTokenIds = [1, 2, 3, 4, 5, 6];

			mockTokenizer.encode.mockReturnValue(mockTokenIds);
			mockTokenizer.decode.mockImplementation((tokens: number[]) => {
				const chunkMap: Record<string, string> = {
					'1,2': 'One two',
					'2,3': 'two three',
					'3,4': 'three four',
					'4,5': 'four five',
					'5,6': 'five six',
				};
				return chunkMap[tokens.join(',')] || '';
			});

			const result = await splitter.splitText(inputText);

			expect(result).toEqual(['One two', 'two three', 'three four', 'four five', 'five six']);
		});

		describe('repetitive content handling', () => {
			it('should use character-based estimation for repetitive content', async () => {
				const splitter = new TokenTextSplitter({
					chunkSize: 100,
					chunkOverlap: 10,
				});

				const repetitiveText = 'a'.repeat(1000);
				const estimatedChunks = ['chunk1', 'chunk2', 'chunk3'];

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);

				const result = await splitter.splitText(repetitiveText);

				// Should not call tiktoken
				expect(tiktokenUtils.getEncoding).not.toHaveBeenCalled();
				expect(mockTokenizer.encode).not.toHaveBeenCalled();

				// Should use estimation
				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(repetitiveText);
				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
					repetitiveText,
					100,
					10,
					'cl100k_base',
				);

				expect(result).toEqual(estimatedChunks);
			});

			it('should use tiktoken for non-repetitive content', async () => {
				const splitter = new TokenTextSplitter({
					chunkSize: 3,
					chunkOverlap: 0,
				});

				const normalText = 'This is normal text without repetition';
				const mockTokenIds = [1, 2, 3, 4, 5, 6];

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
				mockTokenizer.encode.mockReturnValue(mockTokenIds);
				mockTokenizer.decode.mockImplementation(() => 'chunk');

				await splitter.splitText(normalText);

				// Should check for repetition
				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(normalText);

				// Should use tiktoken
				expect(tiktokenUtils.getEncoding).toHaveBeenCalled();
				expect(mockTokenizer.encode).toHaveBeenCalled();

				// Should not use estimation
				expect(tokenEstimator.estimateTextSplitsByTokens).not.toHaveBeenCalled();
			});

			it('should handle repetitive content with different encodings', async () => {
				const splitter = new TokenTextSplitter({
					encodingName: 'o200k_base',
					chunkSize: 50,
					chunkOverlap: 5,
				});

				const repetitiveText = '.'.repeat(500);
				const estimatedChunks = ['estimated chunk 1', 'estimated chunk 2'];

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(estimatedChunks);

				const result = await splitter.splitText(repetitiveText);

				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
					repetitiveText,
					50,
					5,
					'o200k_base',
				);
				expect(result).toEqual(estimatedChunks);
			});

			it('should handle edge case with exactly 100 repeating characters', async () => {
				const splitter = new TokenTextSplitter();
				const edgeText = 'x'.repeat(100);

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue(['single chunk']);

				const result = await splitter.splitText(edgeText);

				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(edgeText);
				expect(result).toEqual(['single chunk']);
			});

			it('should handle mixed content with repetitive sections', async () => {
				const splitter = new TokenTextSplitter();
				const mixedText = 'Normal text ' + 'z'.repeat(200) + ' more normal text';

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(true);
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
					'chunk1',
					'chunk2',
				]);

				const result = await splitter.splitText(mixedText);

				expect(helpers.hasLongSequentialRepeat).toHaveBeenCalledWith(mixedText);
				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalled();
				expect(result).toEqual(['chunk1', 'chunk2']);
			});
		});

		describe('error handling', () => {
			it('should return empty array for null input', async () => {
				const splitter = new TokenTextSplitter();
				const result = await splitter.splitText(null as any);
				expect(result).toEqual([]);
			});

			it('should return empty array for undefined input', async () => {
				const splitter = new TokenTextSplitter();
				const result = await splitter.splitText(undefined as any);
				expect(result).toEqual([]);
			});

			it('should return empty array for non-string input', async () => {
				const splitter = new TokenTextSplitter();
				const result = await splitter.splitText(123 as any);
				expect(result).toEqual([]);
			});

			it('should fall back to estimation if tiktoken fails', async () => {
				const splitter = new TokenTextSplitter();
				const text = 'This will cause tiktoken to fail';

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
				(tiktokenUtils.getEncoding as jest.Mock).mockImplementation(() => {
					throw new Error('Tiktoken error');
				});
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
					'fallback chunk',
				]);

				const result = await splitter.splitText(text);

				expect(result).toEqual(['fallback chunk']);
				expect(tokenEstimator.estimateTextSplitsByTokens).toHaveBeenCalledWith(
					text,
					splitter.chunkSize,
					splitter.chunkOverlap,
					splitter.encodingName,
				);
			});

			it('should fall back to estimation if encode fails', async () => {
				const splitter = new TokenTextSplitter();
				const text = 'This will cause encode to fail';

				(helpers.hasLongSequentialRepeat as jest.Mock).mockReturnValue(false);
				mockTokenizer.encode.mockImplementation(() => {
					throw new OperationalError('Encode error');
				});
				(tokenEstimator.estimateTextSplitsByTokens as jest.Mock).mockReturnValue([
					'fallback chunk',
				]);

				const result = await splitter.splitText(text);

				expect(result).toEqual(['fallback chunk']);
			});
		});
	});
});