// tokenizer.js

/** @module tokenizer */

import Token from "./token.js"

/**
 * Splits text into channel, user, and plain-text tokens.
 * @memberof module:tokenizer
 */
class Tokenizer {

	/**
	 * Prefix that marks the start of a channel reference.
	 * @type {string}
	 */
	static CHANNEL_PREFIX = "#"
	/**
	 * Prefix that marks the start of a user mention.
	 * @type {string}
	 */
	static USER_PREFIX = "@"
	/**
	 * Characters that end a token. The input is scanned one code point
	 * at a time, so each separator must be a single character; a CRLF
	 * pair is handled as "\r" followed by "\n".
	 * @type {string[]}
	 */
	static SEPARATOR_CHARACTERS = ["\t", "\n", "\r", " "]
	/**
	 * Characters allowed inside a channel name (compared case-insensitively).
	 * @type {string[]}
	 */
	static CHANNEL_ALLOWED_CHARACTERS = [..."abcdefghijklmnopqrstuvwxyz1234567890"]
	/**
	 * Characters allowed inside a user name (compared case-insensitively).
	 * @type {string[]}
	 */
	static USER_ALLOWED_CHARACTERS = [..."abcdefghijklmnopqrstuvwxyz1234567890-_"]

	/**
	 * @param   {string} text
	 * @returns {Token[]}
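	 * @example
	 * // Sketch of the expected result, assuming Token keeps its
	 * // constructor arguments as `type`, `buffer`, and `index`
	 * // (`type` and `buffer` are read below; `index` is an assumption):
	 * Tokenizer.tokenize("hi #general")
	 * // => [ Token { type: TEXT, buffer: "hi ", index: 0 },
	 * //      Token { type: CHANNEL, buffer: "#general", index: 3 } ]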
	 */
	static tokenize(text) {
		const characters = [ ...text ]
		const tokens = []
		let buffer = ""
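
		// Scan one code point at a time, accumulating characters in
		// `buffer` and flushing it as a Token at each boundary below.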
		for(const [index, character] of characters.entries()) {

			buffer += character

			// Peek at the next character; null at the end of the input.
			const nextCharacter = index + 1 < characters.length
				? characters[index + 1]
				: null

			const lastToken = tokens[tokens.length - 1]

			// A prefixed token only counts on word boundaries: the next
			// character is a separator (or the input ends) and the previous
			// token, if any, ends with a separator.
			const atBoundary = nextCharacter === null || Tokenizer.SEPARATOR_CHARACTERS.includes(nextCharacter)
			const afterSeparator = !lastToken || Tokenizer.SEPARATOR_CHARACTERS.includes(lastToken.buffer.slice(-1))

			if(buffer[0] === Tokenizer.CHANNEL_PREFIX && buffer.length > 1 && atBoundary && afterSeparator) {
				tokens.push(new Token(Token.TYPE.CHANNEL, buffer, index - buffer.length + 1))
				buffer = ""
			} else if(buffer[0] === Tokenizer.USER_PREFIX && buffer.length > 1 && atBoundary && afterSeparator) {
				tokens.push(new Token(Token.TYPE.USER, buffer, index - buffer.length + 1))
				buffer = ""
			} else if(nextCharacter === null || (!Tokenizer.USER_ALLOWED_CHARACTERS.includes(nextCharacter.toLowerCase()) && !Tokenizer.CHANNEL_ALLOWED_CHARACTERS.includes(nextCharacter.toLowerCase()))) {
				// Plain text: the current run ends here, so either extend the
				// previous TEXT token or start a new one.
				if(lastToken && lastToken.type === Token.TYPE.TEXT) {
					const newBuffer = lastToken.buffer + buffer
					tokens[tokens.length - 1] = new Token(Token.TYPE.TEXT, newBuffer, index - newBuffer.length + 1)
				} else {
					tokens.push(new Token(Token.TYPE.TEXT, buffer, index - buffer.length + 1))
				}
				buffer = ""
			}

		}
		return tokens
	}

}

export default Tokenizer