Module:Scripts/charToScript

< Module:Scripts
Revision as of 13:50, 26 January 2022 by wiktionary>Surjection
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)

Documentation for this module may be created at Module:Scripts/charToScript/doc

local subexport = {}

-- Copied from [[Module:Unicode data]].
local floor = math.floor

-- Binary search for the range containing `codepoint`.
--
-- `ranges` is an array of tables whose first and second items are the lower
-- and upper bounds (both inclusive) of a range. The search only works if the
-- array is ordered by those bounds. The number of entries is read from
-- `ranges.length` when that field is set, and otherwise computed with
-- [[Module:table]].
--
-- Returns the matching range and its index, or nil and the last index that
-- was examined (nil if no index was examined at all).
local function binaryRangeSearch(codepoint, ranges)
	local first = 1
	local last = ranges.length or require "Module:table".length(ranges)
	local middle
	while first <= last do
		middle = floor((first + last) / 2)
		local candidate = ranges[middle]
		-- True when the candidate range begins after the codepoint.
		local startsAfter = codepoint < candidate[1]
		if not startsAfter and codepoint <= candidate[2] then
			return candidate, middle
		end
		if startsAfter then
			-- Continue in the lower half.
			last = middle - 1
		else
			-- The candidate ends before the codepoint: continue in the upper half.
			first = middle + 1
		end
	end
	return nil, middle
end

-- Copied from [[Module:Unicode data]].
--
-- Linear search for the range containing `codepoint`.
--
-- `ranges` is an array of tables whose first and second items are the lower
-- and upper bounds (both inclusive) of a range. The scan gives up as soon as
-- it meets a range that starts above `codepoint`, so the array has to be
-- ordered by lower bound for the result to be meaningful.
--
-- Returns the matching range, or nothing if there is none.
local function linearRangeSearch(codepoint, ranges)
	for _, span in ipairs(ranges) do
		local beyond = codepoint < span[1]
		if beyond then
			-- Every later range starts even higher; stop looking.
			return
		end
		if codepoint <= span[2] then
			return span
		end
	end
end

-- Ordering function for table.sort: a range sorts before another when its
-- first item (the lower bound) is smaller.
local function compareRanges(range1, range2)
	local start1, start2 = range1[1], range2[1]
	return start1 < start2
end

-- Save previously used codepoint ranges in case another character is in the
-- same range. charToScript appends each range it finds by binary search and
-- re-sorts the list with compareRanges, because the cache is scanned with
-- linearRangeSearch, which stops at the first range starting above the
-- codepoint being looked up.
local rangesCache = {}

--[=[
	Takes a codepoint or a character and finds the script code (if any) that is
	appropriate for it based on the codepoint, using the data module
	[[Module:scripts/recognition data]]. The data module was generated from the
	patterns in [[Module:scripts/data]] using [[Module:User:Erutuon/script recognition]].

	A string argument is first converted to a codepoint. The lookup then tries,
	in order:
	1. the list of individual characters (the `individual` field of the data);
	2. the ranges found by earlier calls (rangesCache);
	3. the `blocks` field of the data, searched by the index of the
	   4096-codepoint block that the codepoint belongs to;
	4. the ranges stored in the data under that block index.
	Returns the script code of the first match, else returns "None".

	Throws an error if the argument is neither a string nor a number, or if it
	is a string that does not consist of exactly one character (this includes
	the empty string).
]=]
local charToScriptData
function subexport.charToScript(char)
	-- The data module is loaded on first use and reused by later calls.
	charToScriptData = charToScriptData or mw.loadData("Module:scripts/recognition data")
	local t = type(char)
	local codepoint
	if t == "string" then
		local etc
		-- Ask for the first two codepoints: a second one means the string is
		-- too long, and no first one means the string is empty.
		codepoint, etc = mw.ustring.codepoint(char, 1, 2)
		if etc or not codepoint then
			error("bad argument #1 to 'charToScript' (expected a single character)")
		end
	elseif t == "number" then
		codepoint = char
	else
		error(("bad argument #1 to 'charToScript' (expected string or a number, got %s)")
			:format(t))
	end

	local individualMatch = charToScriptData.individual[codepoint]
	if individualMatch then
		return individualMatch
	end

	local range
	if rangesCache[1] then
		range = linearRangeSearch(codepoint, rangesCache)
		if range then
			return range[3]
		end
	end

	-- Index of the 4096-codepoint block containing the codepoint.
	local index = floor(codepoint / 0x1000)

	-- `blocks` is searched by block index rather than by codepoint.
	range = linearRangeSearch(index, charToScriptData.blocks)
	if not range and charToScriptData[index] then
		range = binaryRangeSearch(codepoint, charToScriptData[index])
		if range then
			-- Remember the range for later calls. The cache has to stay
			-- sorted because linearRangeSearch stops at the first range
			-- that starts above the codepoint it is looking for.
			table.insert(rangesCache, range)
			table.sort(rangesCache, compareRanges)
		end
	end

	return range and range[3] or "None"
end

--[=[
	Finds the script code that subexport.charToScript returns for the greatest
	number of characters in `text`.

	The text is split with a byte pattern that matches one UTF-8 character at a
	time: a lead byte (ASCII, or in the range 0xC2-0xF4) followed by any number
	of continuation bytes (0x80-0xBF). Every character counts, including those
	for which charToScript returns "None".

	If several scripts share the greatest count, the one whose first character
	occurs earliest in the text wins, so the result does not depend on the
	unspecified traversal order of pairs(). Returns nil if no character is
	matched (for instance, if `text` is empty).
]=]
function subexport.findBestScriptWithoutLang(text)
	-- Number of characters seen for each script code.
	local counts = {}
	-- Script codes in order of first appearance, used to break ties.
	local seenOrder = {}
	for character in text:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local script = subexport.charToScript(character)
		local count = counts[script]
		if not count then
			count = 0
			seenOrder[#seenOrder + 1] = script
		end
		counts[script] = count + 1
	end

	local bestScript
	local greatestCount = 0
	for _, script in ipairs(seenOrder) do
		local count = counts[script]
		-- Strictly greater: on a tie the earlier script is kept.
		if count > greatestCount then
			bestScript = script
			greatestCount = count
		end
	end

	return bestScript
end

return subexport