Documentation for this module may be created at Module:Lang/doc

--[=[
This is an experiment to see what is required to consolidate all of the myriad {{lang-xx}} templates
and their subtemplates into a single module with a data table

{{lang-es}} has parameters:
{{{1}}} text - required
{{{link}}} or {{{links}}} defaults to 'yes'
{{{lit}}} literal translation

{{language with name}} has parameters:
{{{1}}} iso language code - required
{{{2}}} language name in English - superfluous?
{{{3}}} text - required
{{{4}}} or {{{lit}}} literal translation
{{{link}}} or {{{links}}} set to 'no' disables '{{{2}}} language' wikilinks
{{{rtl}}} passed through to {{lang}}
{{{nocat}}} passed through to {{lang}}

{{lang}} has parameters:
{{{1}}} iso language code - required
{{{2}}} text - required
{{{rtl}}} if set to any value, set dir="rtl" attribute and inserts &lrm; after the </span> tag
{{{nocat}}} if set to any value, disables categorization

How it works now:
	1. {{lang-es}} receives text as {{{1}}} to which it adds italic markup, sets |links to {{{link}}} or {{{links}}} or yes, sets |lit to {{{lit}}}, and calls {{language with name}}.  {{{rtl}}} ignored for this example
	2. {{language with name}} renders [[Spanish language|Spanish]] ({{{links}}} not set) or Spanish ({{{links}}} set) and calls {{lang}}
	3. {{lang}} wraps the text in the <span lang="es">text</span> and adds categorization
]=]

require('Module:No globals');
local p = {};

local getArgs = require ('Module:Arguments').getArgs;
local lang_name_table = require ('Module:Language/name/data');

--local script_name_table = mw.loadData ('Module:Language/data/iana scripts');
--local region_name_table = mw.loadData ('Module:Language/data/iana regions');
--local variant_name_table = mw.loadData ('Module:Language/data/iana variants');

local lang_data =  mw.loadData ('Module:Lang/data');							-- language name override and transliteration tool-tip tables

local namespace = mw.title.getCurrentTitle().namespace;							-- used for categorization


--[[--------------------------< I S _ S E T >------------------------------------------------------------------

Returns true if argument is set; false otherwise. Argument is 'set' when it exists (not nil) and is not an empty string.

]]

local function is_set( var )
	if var == nil then
		return false;															-- nothing was supplied at all
	end
	return var ~= '';															-- an empty string counts as not set; anything else is set
end


--[[--------------------------< I N _ A R R A Y >--------------------------------------------------------------

Whether needle is in haystack

]]

local function in_array( needle, haystack )
	if needle ~= nil then														-- a nil needle can never be found; haystack is not touched in that case
		for index, candidate in ipairs( haystack ) do
			if candidate == needle then
				return index;													-- position of the first match (truthy)
			end
		end
	end
	return false;																-- not found
end


--[[--------------------------< F O R M A T _ I E T F _ T A G >------------------------------------------------

prettify ietf tags to use recommended subtag formats:
	code: lower case
	script: sentence case
	region: upper case
	variant: lower case

]]

local function format_ietf_tag (code, script, region, variant)
	local subtags = {};

	subtags[#subtags + 1] = code:lower();										-- language subtag: always present, always lower case

	if is_set (script) then
		local initial = script:match ('^%a'):upper();							-- upper case form of the leading letter
		local sentence_case = script:lower():gsub ('^%a', initial, 1);			-- lower case everything, then restore the leading capital
		subtags[#subtags + 1] = sentence_case;
	end

	if is_set (region) then
		subtags[#subtags + 1] = region:upper();									-- region subtag: upper case
	end

	if is_set (variant) then
		subtags[#subtags + 1] = variant:lower();								-- variant subtag: lower case
	end

	return table.concat (subtags, '-');											-- hyphen-join whichever subtags were supplied
end


--[[--------------------------< G E T _ I E T F _ P A R T S >--------------------------------------------------

extracts and returns IETF language tag parts:
	primary language subtag (required) - 2 or 3 character lower case IANA language code [ll]
	script subtag - four character title-case IANA script code [Ssss]
	region subtag - two-character upper-case IANA region code [RR]

in any one of these forms
	ll (or lll)
	ll-Ssss
	ll-RR
	ll-Ssss-RR
each of ll, Ssss, and RR when used must be valid

returns four values: code, script, region, variant.  Valid parts are returned as themselves (the language code
is returned in lower case); omitted parts are returned as empty strings; invalid parts are returned as nil.

see http://www.rfc-editor.org/rfc/bcp/bcp47.txt section 2.1

]]

-- Anchored patterns for every tag form that get_ietf_parts() accepts, listed in the order in which they are tried.
-- Each entry holds the pattern followed by the names of the subtags captured by its second and later captures;
-- the first capture of every pattern is the language subtag.
local ietf_tag_forms;
do
	local lang = '^(%a%a%a?)';													-- primary language subtag: 2 or 3 letters
	local script = '%-(%a%a%a%a)';												-- script subtag: 4 letters
	local region_a = '%-(%a%a)';												-- region subtag: 2 letters
	local region_d = '%-(%d%d%d)';												-- region subtag: 3 digits
	local variant_d = '%-(%d%d%d%d)';											-- variant subtag: 4 digits
	local variant_an = '%-([%a%d][%a%d][%a%d][%a%d][%a%d][%a%d]?[%a%d]?[%a%d]?)';	-- variant subtag: 5-8 alnum characters

	ietf_tag_forms = {
		{lang .. script .. region_a .. variant_d .. '$', 'script', 'region', 'variant'},	-- ll-Ssss-RR-variant (variant is 4 digits)
		{lang .. script .. region_d .. variant_d .. '$', 'script', 'region', 'variant'},	-- ll-Ssss-DDD-variant (variant is 4 digits)
		{lang .. script .. region_a .. variant_an .. '$', 'script', 'region', 'variant'},	-- ll-Ssss-RR-variant (variant is 5-8 alnum characters)
		{lang .. script .. region_d .. variant_an .. '$', 'script', 'region', 'variant'},	-- ll-Ssss-DDD-variant (variant is 5-8 alnum characters)

		{lang .. script .. variant_d .. '$', 'script', 'variant'},				-- ll-Ssss-variant (variant is 4 digits)
		{lang .. script .. variant_an .. '$', 'script', 'variant'},				-- ll-Ssss-variant (variant is 5-8 alnum characters)

		{lang .. region_a .. variant_d .. '$', 'region', 'variant'},			-- ll-RR-variant (variant is 4 digits)
		{lang .. region_d .. variant_d .. '$', 'region', 'variant'},			-- ll-DDD-variant (variant is 4 digits)
		{lang .. region_a .. variant_an .. '$', 'region', 'variant'},			-- ll-RR-variant (variant is 5-8 alnum characters)
		{lang .. region_d .. variant_an .. '$', 'region', 'variant'},			-- ll-DDD-variant (variant is 5-8 alnum characters)

		{lang .. variant_d .. '$', 'variant'},									-- ll-variant (variant is 4 digits)
		{lang .. variant_an .. '$', 'variant'},									-- ll-variant (variant is 5-8 alnum characters)

		{lang .. script .. region_a .. '$', 'script', 'region'},				-- ll-Ssss-RR
		{lang .. script .. region_d .. '$', 'script', 'region'},				-- ll-Ssss-DDD

		{lang .. script .. '$', 'script'},										-- ll-Ssss

		{lang .. region_a .. '$', 'region'},									-- ll-RR
		{lang .. region_d .. '$', 'region'},									-- ll-DDD

		{lang .. '$'},															-- ll
	};
end

-- Splits an IETF language tag into language code, script, region and variant subtags and validates each of them
-- against the language-name data tables.
--
-- source: the tag to split (string)
--
-- Returns four values: code, script, region, variant.  code is returned in lower case; the other subtags are
-- returned as written in source.  Subtags that were omitted are returned as empty strings.  When source is not
-- set, is malformed, or has an unknown language code, all four values are nil; when a later subtag is invalid,
-- that subtag and those after it are nil.
local function get_ietf_parts (source)
	if not is_set (source) then
		return nil, nil, nil, nil;
	end

	local code;
	local parts = {script = '', region = '', variant = ''};					-- omitted subtags stay as empty strings

	for _, form in ipairs (ietf_tag_forms) do									-- first form that matches wins
		local captures = {source:match (form[1])};
		if captures[1] then
			code = captures[1];
			for i = 2, #form do													-- copy the remaining captures into their named slots
				parts[form[i]] = captures[i];
			end
			break;
		end
	end

	if not code then
		return nil, nil, nil, nil;												-- don't know what we got but it is malformed
	end

	code = code:lower();														-- ensure that we use and return lower case version of this

	local script, region, variant = parts.script, parts.region, parts.variant;

	if not (lang_data.override[code] or lang_name_table.lang[code]) then
		return nil, nil, nil, nil;												-- invalid language code, don't know about the others (don't care?)
	end

	if is_set (script) then
		if not lang_name_table.script[script:lower()] then
			return code, nil, nil, nil;											-- language code ok, invalid script, don't know about the others (don't care?)
		end
	end

	if is_set (region) then
		if not lang_name_table.region[region:lower()] then
			return code, script, nil, nil;										-- language code and script ok, invalid region
		end
	end

	if is_set (variant) then
		local variant_data = lang_name_table.variant[variant:lower()];
		if not variant_data then
			return code, script, region, nil;									-- unknown variant
		end
		if not in_array (code, variant_data['prefixes']) then
			return code, script, region, nil;									-- variant is known but not listed for this language code
		end
	end

	return code, script, region, variant;										-- return the good bits
end


--[=[-------------------------< M A K E _ E R R O R _ M S G >--------------------------------------------------

]=]

-- Wraps msg in an error-styled <span> and appends the error category.
-- nocat is accepted but currently unused: the namespace / nocat gating below is disabled, so the category is
-- always added.
local function make_error_msg (msg, nocat)
	local pieces = {};

	pieces[#pieces + 1] = '<span style="font-size:100%" class="error">error: ';
	pieces[#pieces + 1] = msg;
	pieces[#pieces + 1] = '</span>';

--	if (0 == namespace) and not is_set (nocat) then								-- only categorize in article space
	pieces[#pieces + 1] = '[[Category:lang and lang-xx template errors]]';
--	end

	return table.concat (pieces);
end
	

--[=[-------------------------< M A K E _ W I K I L I N K >----------------------------------------------------

Makes a wikilink; when both link and display text are provided, returns a wikilink in the form [[L|D]]; if only
link is provided, returns a wikilink in the form [[L]]; if neither is provided or link is omitted, returns an
empty string.

]=]

local function make_wikilink (link, display)
	if not is_set (link) then
		return '';																-- no target, no link
	end

	if not is_set (display) then
		return table.concat ({'[[', link, ']]'});								-- plain link: [[L]]
	end

	return table.concat ({'[[', link, '|', display, ']]'});						-- piped link: [[L|D]]
end


--[[--------------------------< M A K E _ T E X T _ S P A N >--------------------------------------------------

]]

-- Builds the <span lang="...">...</span> wrapper around text.
--	code: value for the lang attribute
--	text: the text to wrap
--	rtl: when 'yes', adds dir="rtl" to the span and appends &lrm; after it
--	italic: when 'yes', wraps text in wiki italic markup
--	size: when set, emitted as a css font-size value in a style attribute
local function make_text_span (code, text, rtl, italic, size)
	local html = {};
	local is_rtl = ('yes' == rtl);

	local function add (piece)													-- append one piece to the output buffer
		html[#html + 1] = piece;
	end

	add ('<span lang="');														-- opening tag with its language attribute
	add (code);
	add ('"');
	if is_rtl then
		add (' dir="rtl"');														-- right-to-left languages
	end
	if is_set (size) then														-- {{lang}} only
		add (table.concat ({' style="font-size:', size, ';"'}));
	end
	add ('>');																	-- end of the opening tag

	if 'yes' == italic then
		add (table.concat ({"''", text, "''"}));								-- italicized text
	else
		add (text);																-- DEFAULT: plain text
	end

	add ('</span>');
	if is_rtl then
		add ('&lrm;');															-- tell the browser that the rtl run has ended
	end

	return table.concat (html);
end


--[[--------------------------< M A K E _ C A T E G O R Y >----------------------------------------------------

]]

-- Returns the 'Articles containing ...-language text' category link for code / language_name, or an empty
-- string when nocat is truthy or the current page is not in namespace 0.
local function make_category (code, language_name, nocat)
	if nocat or (0 ~= namespace) then											-- only categorize in article space, and only when not suppressed
		return '';																-- empty string so that callers can concatenate unconditionally
	end

	local descriptor;
	if ('en' == code) or ('eng' == code) then
		descriptor = 'explicitly cited English';
	elseif 'art' == code then
		descriptor = 'constructed';
	else
		descriptor = language_name;												-- everything else is categorized by language name
	end

	local cat = {};
	cat[#cat + 1] = '[[Category:Articles containing ';
	cat[#cat + 1] = descriptor;
	cat[#cat + 1] = '-language text]]';

	return table.concat (cat);
end


--[[--------------------------< M A K E _ T R A N S L I T >----------------------------------------------------

return translit <span>...</span> else return empty string

The value |script= is not used in {{transl}} for this purpose; instead it uses |code.  Because language scripts
are listed in the {{transl}} switches they are included in the data tables.  The script parameter is introduced
at {{Language with name and transliteration}}.  If |script= is set, this function uses it in preference to code.

]]

-- Builds the italicized transliteration <span> with a tool tip chosen from lang_data.translit_title_table.
--	code: language code, used as the lookup key when script is not set or has no entry
--	language_name: used for the generic '<language_name> transliteration' tool tip
--	translit: the transliterated text
--	std: transliteration standard; selects a sub-table of the title table
--	script: script key; used in preference to code
-- Returns the markup, or an empty string when std is unknown or the std / script combination has no entry so that
-- the caller can emit an error message.
local function make_translit (code, language_name, translit, std, script)
	local tout = {};
	local title_table = lang_data.translit_title_table;						-- table of transliteration standards and the language codes and scripts that apply to those standards
	
	table.insert (tout, "''<span title=\"");
	
	if not is_set (std) and not is_set (script) then							-- when neither standard nor script specified
		table.insert (tout, language_name);										-- write a generic tool tip
		table.insert (tout, ' transliteration');
	elseif is_set (std) and is_set (script) then								-- when both are specified
		if title_table[std] and title_table[std][script] then					-- and legitimate; the standard must exist before it can be indexed by script
			table.insert (tout, title_table[std][script]);						-- add the appropriate text to the tool tip
		else
			return '';															-- one or both invalid, set up for an error message
		end
	elseif is_set (std) then													-- script not set, use language code
		if not title_table[std] then return ''; end								-- invalid standard, set up for an error message
		
		if title_table[std][code] then
			table.insert (tout, title_table[std][code]);
		else																	-- code doesn't match
			table.insert (tout, title_table[std]['default']);					-- so use the standard's default
		end
	else																		-- here if script set but std not set
		if title_table['NO_STD'][script] then
			table.insert (tout, title_table['NO_STD'][script]);					-- use script if set
		elseif title_table['NO_STD'][code] then
			table.insert (tout, title_table['NO_STD'][code]);					-- use language code
		else
			table.insert (tout, language_name);									-- write a generic tool tip
			table.insert (tout, ' transliteration');
		end
	end

	table.insert (tout, '" class="Unicode" style="white-space:normal; text-decoration:none">');
	table.insert (tout, translit);
	table.insert (tout, "</span>''");
	return table.concat (tout);
end


--[[--------------------------< L A N G >----------------------------------------------------------------------

{{#invoke:lang|lang|code=<code>|text={{{1}}}|rtl={{{rtl|}}}|size={{{size|}}}|nocat={{{nocat|}}}}}

|code = the BCP47 language code
|text = the displayed text in language specified by code
|rtl = boolean true identifies the language specified by code as a right-to-left language
|size = css keyword appropriate for use with css font-size:<size>
|nocat = boolean true inhibits normal categorization; error categories are not affected

]]

-- Entry point for {{lang}}: validates |code= and |text=, then returns the language-tagged <span> followed by
-- categorization.  On invalid input returns an error message instead.
function p.lang (frame)
	local args = getArgs(frame);
	
	local out = {};
	local language_name;
	local code, script, region, variant = get_ietf_parts (args.code);			-- omitted subtags come back as '' (truthy); invalid ones as nil
	
	if not (code and script and region and variant) then
		return make_error_msg (table.concat ({'lang: unknown language code: ', args.code or 'missing'}), args.nocat);
	end
	
	if not is_set (args.text) then
		return make_error_msg ('lang: no text', args.nocat);
	end

	args.code = format_ietf_tag (code, script, region, variant);				-- format to recommended subtag styles

	if lang_data.override[code] then
		language_name = lang_data.override[code][1]
	elseif lang_name_table.lang[code] then
		language_name = lang_name_table.lang[code][1];							-- table entries sometimes have multiple names, always take the first one
	end

	table.insert (out, make_text_span (args.code, args.text, args.rtl, nil, args.size));		-- italics set to nil here because not supported by {{lang}}
	table.insert (out, make_category (code, language_name, args.nocat));		-- language subtag only: make_category() compares it against bare codes
	if 0 == namespace then																		-- args.nocat intentionally ignored here
		table.insert (out, '[[Category:Lang and Lang-xx templates using Module:Lang]]');		-- keep until lang and all lang-xx templates converted to the module
	end
	return table.concat (out);													-- put it all together and done
end


--[[--------------------------< L A N G _ X X >----------------------------------------------------------------

{{#invoke:lang|lang_xx|code=<code>|text={{{1}}}|link={{{links|{{{link}}}}}}|rtl={{{rtl|}}}|nocat={{{nocat|}}}|italic={{{italic|}}}|lit={{{lit|}}}|translit={{{translit|}}}|script={{{script|}}}|std={{{std|}}}}}

|code = the BCP47 language code
|text = the displayed text in language specified by code
|link = boolean true (default) links language specified by code to associated language article
|rtl = boolean true identifies the language specified by code as a right-to-left language
|nocat = boolean true inhibits normal categorization; error categories are not affected
|italic = boolean true (default) renders displayed text in italic font
|lit = text that is a literal translation of text

for those {{lang-xx}} templates that support transliteration:
|translit = text that is a transliteration of text
|std = the standard that applies to the transliteration
|script = ISO 15924 script name; falls back to code

]]

-- Entry point for the {{lang-xx}} templates: validates |code= and |text=, then returns the (optionally linked)
-- language name, the language-tagged <span>, optional transliteration and literal translation, and
-- categorization.  On invalid input returns an error message instead.
function p.lang_xx (frame)
	local args = getArgs(frame);
	
	if not is_set (args.italic) then
		args.italic = 'yes';													-- DEFAULT for {{lang-xx}} templates is to italicize
	end
	
	args.size = nil;															-- size not supported in {{lang-xx}}
	
	local out = {};
	local language_name;
	local code, script, region, variant = get_ietf_parts (args.code);			-- omitted subtags come back as '' (truthy); invalid ones as nil

	if not (code and script and region and variant) then
		return make_error_msg (table.concat ({'lang-xx: unknown language code: ', args.code or 'missing'}), args.nocat);
	end
	
	if not is_set (args.text) then
		return make_error_msg ('lang-xx: no text', args.nocat);
	end

	args.code = format_ietf_tag (code, script, region, variant);				-- format to recommended subtag styles

	if lang_data.override[code] then
		language_name = lang_data.override[code][1]
	elseif lang_name_table.lang[code] then
		language_name = lang_name_table.lang[code][1];							-- table entries sometimes have multiple names, always take the first one
	end

	local translit_script = args.script or language_name;						-- for translit prefer script over language

	if 'no' == args.link then
		table.insert (out, language_name);										-- language name without wikilink
	else
		table.insert (out, make_wikilink (language_name .. ' language', language_name));	-- language name with wikilink
	end
	table.insert (out, ': ');													-- separator

	table.insert (out, make_text_span (args.code, args.text, args.rtl, args.italic, args.size))
	
	if is_set (args.translit) then												-- transliteration (not supported in {{lang}}); not supported in all {{lang-xx}}
		local romanization_page = 'Romanization of ' .. translit_script;		-- the one page that is both checked for existence and linked
		local link_romanization = false;

		if 'no' ~= args.link then												-- only look the page up when a link is wanted
			local translit_title = mw.title.makeTitle (0, romanization_page);	-- NOTE(review): guarded below in case no title object is returned for an invalid name
			link_romanization = (translit_title and translit_title.exists) and true or false;
		end

		table.insert (out, ', <small>');
		if link_romanization then
			table.insert (out, make_wikilink (romanization_page, 'translit.'));
		else
			table.insert (out, '<abbr title="transliteration">translit.</abbr>');
		end
		table.insert (out, '&nbsp;</small>');

		local translit = make_translit (args.code, language_name, args.translit, args.std, args.script)
		if is_set (translit) then
			table.insert (out, translit);
		else																	-- make_translit() returns '' for an invalid std / script
			return make_error_msg (table.concat ({'lang-xx: invalid translit std: \'', args.std or 'missing', '\' or script: \'', args.script or 'missing', '\''}), args.nocat);
		end
	end
	
	if is_set (args.lit) then													-- translation (not supported in {{lang}})
		table.insert (out, ', <small>');
		if 'no' == args.link then
			table.insert (out, '<abbr title="literal translation">lit.</abbr>');
		else
			table.insert (out, make_wikilink ('Literal translation', 'lit.'));
		end
		table.insert (out, "&nbsp;</small>'");
		table.insert (out, args.lit);
		table.insert (out, "'");
	end
	
	table.insert (out, make_category (code, language_name, args.nocat));		-- language subtag only: make_category() compares it against bare codes
	if 0 == namespace then																		-- args.nocat intentionally ignored here
		table.insert (out, '[[Category:Lang and Lang-xx templates using Module:Lang]]');		-- keep until lang and all lang-xx templates converted to the module
	end
	return table.concat (out);													-- put it all together and done
end

return p;