630
edits
(move some data to a data submodule) |
m (1 revision imported) |
||
Line 1: | Line 1: | ||
local | local p = {} | ||
local floor = math.floor | local floor = math.floor | ||
-- Raise an error whose message is built with string.format.
-- Two call shapes are accepted:
--   errorf(level, fmt, ...)  -- numeric level, as for error(); it is bumped by
--                            -- one so that errorf's own stack frame is skipped
--   errorf(fmt, ...)         -- no level; the error is attributed to the caller
--                            -- of errorf (level 2)
-- This function never returns normally.
local function errorf(level, ...)
	if type(level) == "number" then
		return error(string.format(...), level + 1)
	else -- level is actually the format string.
		return error(string.format(level, ...), 2)
	end
end
Line 14: | Line 13: | ||
local function binary_range_search(codepoint, ranges) | local function binary_range_search(codepoint, ranges) | ||
local low, mid, high | local low, mid, high | ||
low, high = 1, ranges.length or require "Module: | low, high = 1, ranges.length or require "Module:TableTools".length(ranges) | ||
while low <= high do | while low <= high do | ||
mid = floor((low + high) / 2) | mid = floor((low + high) / 2) | ||
Line 28: | Line 27: | ||
return nil, mid | return nil, mid | ||
end | end | ||
p.binary_range_search = binary_range_search | |||
--[[ | |||
local function linear_range_search(codepoint, ranges) | local function linear_range_search(codepoint, ranges) | ||
for i, range in ipairs(ranges) do | for i, range in ipairs(ranges) do | ||
if | if range[1] <= codepoint and codepoint <= range[2] then | ||
return range | return range | ||
end | end | ||
end | end | ||
end | end | ||
--]] | |||
-- Load a module by indexing "loader" with the name of the module minus the | -- Load a module by indexing "loader" with the name of the module minus the | ||
Line 58: | Line 57: | ||
-- see "Hangul Syllable Name Generation" in section 3.12 of the | -- see "Hangul Syllable Name Generation" in section 3.12 of the | ||
-- Unicode Specification: | -- Unicode Specification: | ||
-- https://www.unicode.org/versions/ | -- https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf | ||
local name_hooks = { | local name_hooks = { | ||
{ 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters | { 0x00, 0x1F, "<control-%04X>" }, -- C0 control characters | ||
Line 85: | Line 80: | ||
{ 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | { 0xF900, 0xFA6D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
{ 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | { 0xFA70, 0xFAD9, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
{ 0x17000, 0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut | { 0x17000, 0x187F7, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph | ||
{ 0x18800, 0x18AFF, function (codepoint) | { 0x18800, 0x18AFF, function (codepoint) | ||
return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | return ("TANGUT COMPONENT-%03d"):format(codepoint - 0x187FF) | ||
end }, | end }, | ||
{ 0x18D00, 0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut | { 0x18D00, 0x18D08, "TANGUT IDEOGRAPH-%04X" }, -- Tangut Ideograph Supplement | ||
{ 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | { 0x1B170, 0x1B2FB, "NUSHU CHARACTER-%04X" }, -- Nushu | ||
{ 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | { 0x20000, 0x2A6DF, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B | ||
{ 0x2A700, 0x2B738, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C | { 0x2A700, 0x2B738, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C | ||
{ | { 0x2B740, 0x2B81D, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D | ||
{ 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | { 0x2B820, 0x2CEA1, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension E | ||
{ 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | { 0x2CEB0, 0x2EBE0, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension F | ||
-- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane) | -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane) | ||
{ 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | { 0x2F800, 0x2FA1D, "CJK COMPATIBILITY IDEOGRAPH-%04X" }, | ||
{ 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | { 0xE0100, 0xE01EF, function (codepoint) -- Variation Selectors Supplement | ||
return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | return ("VARIATION SELECTOR-%d"):format(codepoint - 0xE0100 + 17) | ||
end}, | end}, | ||
{ 0x30000, 0x3134A, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension G | |||
{ 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | { 0xF0000, 0xFFFFD, "<private-use-%04X>" }, -- Plane 15 Private Use | ||
{ 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use | { 0x100000, 0x10FFFD, "<private-use-%04X>" } -- Plane 16 Private Use | ||
Line 130: | Line 124: | ||
--]] | --]] | ||
-- https://www.unicode.org/versions/ | -- https://www.unicode.org/versions/Unicode11.0.0/ch04.pdf, section 4.8 | ||
function | function p.lookup_name(codepoint) | ||
-- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned | -- U+FDD0-U+FDEF and all code points ending in FFFE or FFFF are Unassigned | ||
-- (Cn) and specifically noncharacters: | -- (Cn) and specifically noncharacters: | ||
Line 166: | Line 160: | ||
end | end | ||
function | --[[ | ||
-- No image data modules on Wikipedia yet. | |||
function p.lookup_image(codepoint) | |||
local data = loader[('images/%03X'):format(codepoint / 0x1000)] | local data = loader[('images/%03X'):format(codepoint / 0x1000)] | ||
Line 173: | Line 169: | ||
end | end | ||
end | end | ||
--]] | |||
-- Names of the Unicode planes that have a dedicated name, keyed by plane
-- number. Planes without an entry here have no special name.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[ 3] = "Tertiary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplementary Private Use Area-A";
	[16] = "Supplementary Private Use Area-B";
}
-- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable. | -- Load [[Module:Unicode data/blocks]] if needed and assign it to this variable. | ||
Line 182: | Line 189: | ||
if data then | if data then | ||
-- Unpack doesn't work on tables loaded with mw.loadData. | -- Unpack doesn't work on tables loaded with mw.loadData. | ||
return i, data[ | return i, data[1], data[2], data[3] | ||
end | end | ||
end | end | ||
-- An ipairs-type iterator generator for the list of blocks. | -- An ipairs-type iterator generator for the list of blocks. | ||
function p.enum_blocks()
	-- Returns the usual generic-for triple: iterator function, invariant
	-- state (the block list) and initial control value 0, so callers can
	-- write "for i, ... in p.enum_blocks() do".
	local blocks = loader.blocks
	return block_iter, blocks, 0
end
-- Return the name of the plane that contains the given code point.
-- The plane number is the code point divided by 0x10000, rounded down.
-- Planes without an entry in "planes" are reported as "Plane N".
function p.lookup_plane(codepoint)
	local i = floor(codepoint / 0x10000)
	return planes[i] or ("Plane %u"):format(i)
end
function | function p.lookup_block(codepoint) | ||
local blocks = loader.blocks | local blocks = loader.blocks | ||
local range = binary_range_search(codepoint, blocks) | local range = binary_range_search(codepoint, blocks) | ||
Line 221: | Line 214: | ||
end | end | ||
function | function p.get_block_info(name) | ||
for i, block in ipairs(loader.blocks) do | for i, block in ipairs(loader.blocks) do | ||
if block[3] == name then | if block[3] == name then | ||
Line 229: | Line 222: | ||
end | end | ||
function | function p.is_valid_pagename(pagename) | ||
local has_nonws = false | local has_nonws = false | ||
Line 246: | Line 239: | ||
end | end | ||
local printable, result = | local printable, result = p.is_printable(cp) | ||
if not printable then | if not printable then | ||
return false | return false | ||
Line 335: | Line 328: | ||
end | end | ||
return match_func(codepoint | return match_func(codepoint) | ||
end | end | ||
end | end | ||
Line 345: | Line 338: | ||
-- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for | -- See https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values for | ||
-- more information. | -- more information. | ||
p.is_combining = memo_lookup( | |||
"combining", | "combining", | ||
function (codepoint, combining_class) | function (codepoint, combining_class) | ||
Line 352: | Line 345: | ||
0) | 0) | ||
function | function p.add_dotted_circle(str) | ||
return (mw.ustring.gsub(str, ".", | return (mw.ustring.gsub(str, ".", | ||
function(char) | function(char) | ||
if | if p.is_combining(mw.ustring.codepoint(char)) then | ||
return '◌' .. char | return '◌' .. char | ||
end | end | ||
Line 367: | Line 360: | ||
end, | end, | ||
"assigned") | "assigned") | ||
p.lookup_control = lookup_control | |||
-- True unless lookup_control classifies the code point as "unassigned".
function p.is_assigned(codepoint)
	return lookup_control(codepoint) ~= "unassigned"
end
-- Returns two values: whether lookup_control classifies the code point as
-- "assigned" or "space-separator", and the raw classification string itself.
function p.is_printable(codepoint)
	local result = lookup_control(codepoint)
	return (result == "assigned") or (result == "space-separator"), result
end
-- Returns two values: whether lookup_control classifies the code point as
-- "space-separator", and the raw classification string itself.
function p.is_whitespace(codepoint)
	local result = lookup_control(codepoint)
	return (result == "space-separator"), result
end
p.lookup_category = memo_lookup( | |||
"category", | "category", | ||
function (codepoint, category) | function (codepoint, category) | ||
Line 390: | Line 383: | ||
"Cn") | "Cn") | ||
-- Script lookup built with memo_lookup over the "scripts" data.
-- The callback substitutes 'Zzzz' when no script code is supplied to it.
-- NOTE(review): memo_lookup is defined outside this view; the final "Zzzz"
-- argument is presumably the default for code points with no entry -- confirm.
local lookup_script = memo_lookup(
	"scripts",
	function (codepoint, script_code)
		return script_code or 'Zzzz'
	end,
	"Zzzz")
-- Also exported so it can be reached as p.lookup_script.
p.lookup_script = lookup_script
-- Determine the single script used by a string.
-- Characters whose script is Zyyy, Zinh or Zzzz are ignored. If the remaining
-- characters all share one script, that script code is returned (followed by
-- the value `true`, because the result of next() is returned as-is). If no
-- character has a non-ignored script, nil is returned. If two or more
-- distinct scripts occur, nothing is returned.
-- Raises an error if str is not a string.
function p.get_best_script(str)
	-- Check type of argument, because mw.text.decode coerces numbers to strings!
	require "libraryUtil".checkType("get_best_script", 1, str, "string")

	-- Convert HTML character references (including named character references,
	-- or character entities) to characters.
	str = mw.text.decode(str, true)

	-- Set of script codes seen so far (script code -> true).
	local scripts = {}
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		-- Ignore "Inherited", "Undetermined", or "Uncoded" scripts.
		if not (script == "Zyyy" or script == "Zinh" or script == "Zzzz") then
			scripts[script] = true
		end
	end

	-- If scripts does not contain two or more keys,
	-- return first and only key (script code) in table.
	if not next(scripts, next(scripts)) then
		return next(scripts)
	end -- else return majority script, or else "Zzzz"?
end
-- Checks that a string contains at least one Latin-script character and
-- otherwise only characters of ignorable scripts (Zyyy, Zinh, Zzzz).
-- HTML character references in str are decoded before the check.
-- Raises an error if str is not a string.
function p.is_Latin(str)
	-- Report this function's own name in the type error.
	require "libraryUtil".checkType("is_Latin", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0340-U+10FFFF. If they are not found and there is at least
	-- one Latin-script character, the string counts as Latin, because the rest
	-- of the characters can only be Zyyy, Zinh, and Zzzz.
	-- The only scripts found below U+0370 (the first code point of the Greek
	-- and Coptic block) are Latn, Zyyy, Zinh, and Zzzz.
	-- See the codepage in the [[UTF-8]] article.
	if not str:find "[\205-\244]" then
		for codepoint in mw.ustring.gcodepoint(str) do
			if lookup_script(codepoint) == "Latn" then
				return true
			end
		end
		-- No Latin character was found. The general scan below could only
		-- return false for this string as well, so skip it.
		return false
	end

	-- General case: any character of a non-ignorable, non-Latin script
	-- disqualifies the string.
	local Latn = false

	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if script == "Latn" then
			Latn = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return Latn
end
-- Checks that a string contains only characters belonging to right-to-left
-- scripts, or characters of ignorable scripts. At least one right-to-left
-- character is required for a true result.
-- HTML character references in str are decoded before the check.
-- Raises an error if str is not a string.
function p.is_rtl(str)
	-- Report this function's own name in the type error.
	require "libraryUtil".checkType("is_rtl", 1, str, "string")
	str = mw.text.decode(str, true)

	-- Search for the leading bytes that introduce the UTF-8 encoding of the
	-- code points U+0580-U+10FFFF. If they are not found, the string can only
	-- have characters from a left-to-right script, because the first code point
	-- in a right-to-left script is U+0591, in the Hebrew block.
	if not str:find "[\214-\244]" then
		return false
	end

	local result = false
	-- Table indexed by script code; a truthy entry marks a right-to-left script.
	local rtl = loader.scripts.rtl
	for codepoint in mw.ustring.gcodepoint(str) do
		local script = lookup_script(codepoint)

		if rtl[script] then
			result = true
		elseif not (script == "Zyyy" or script == "Zinh"
				or script == "Zzzz") then
			return false
		end
	end

	return result
end
-- Read parameter `arg` of `args` as a hexadecimal code point.
-- Returns the code point as a number in the range 0..0x10FFFF.
-- Raises an error (attributed to the caller) if the parameter is missing,
-- is not a hexadecimal number, or is out of range.
local function get_codepoint(args, arg)
	local codepoint_string = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	local codepoint = tonumber(codepoint_string, 16)
		or errorf(2, "Parameter %s is not a code point in hexadecimal base",
			tostring(arg))
	if not (0 <= codepoint and codepoint <= 0x10FFFF) then
		errorf(2, "code point in parameter %s out of range", tostring(arg))
	end
	return codepoint
end
-- Resolve an exported function from a template parameter.
-- The trimmed value of parameter `arg` is appended to `prefix`, and the
-- function of that name is fetched from the module table p.
-- Raises an error (attributed to the caller) if the parameter is missing or
-- no such function exists.
local function get_func(args, arg, prefix)
	local suffix = args[arg]
		or errorf(2, "Parameter %s is required", tostring(arg))
	suffix = mw.text.trim(suffix)
	local func_name = prefix .. suffix
	local func = p[func_name]
		or errorf(2, "There is no function '%s'", func_name)
	return func
end
-- This function allows any of the "lookup" functions to be invoked. The first
-- parameter is the word after "lookup_"; the second parameter is the code point
-- in hexadecimal base.
function p.lookup(frame)
	local func = get_func(frame.args, 1, "lookup_")
	local codepoint = get_codepoint(frame.args, 2)
	local result = func(codepoint)
	if func == p.lookup_name then
		-- Prevent code point labels such as <control-0000> from being
		-- interpreted as HTML tags: escape "<" as the entity "&lt;".
		-- Only the first return value of gsub (the string) is kept.
		result = result:gsub("<", "&lt;")
	end
	return result
end
-- Invoke any of the "is_" functions from a template. The first parameter is
-- the word after "is_"; the second is the function's argument.
function p.is(frame)
	local func = get_func(frame.args, 1, "is_")

	-- is_Latin, is_valid_pagename and is_rtl take strings.
	if func == p.is_Latin or func == p.is_valid_pagename or func == p.is_rtl then
		return (func(frame.args[2])) -- Adjust to one result.
	else -- The rest take code points, given in hexadecimal base.
		local codepoint = get_codepoint(frame.args, 2)
		return (func(codepoint)) -- Adjust to one result.
	end
end
return | return p |