-- ***************************************************************
--
-- Parse HTML 4.01 strict (mostly)
-- Copyright 2020 by Sean Conner.  All Rights Reserved.
--
-- This library is free software; you can redistribute it and/or modify it
-- under the terms of the GNU Lesser General Public License as published by
-- the Free Software Foundation; either version 3 of the License, or (at your
-- option) any later version.
--
-- This library is distributed in the hope that it will be useful, but
-- WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-- License for more details.
--
-- You should have received a copy of the GNU Lesser General Public License
-- along with this library; if not, see <http://www.gnu.org/licenses/>.
--
-- Comments, questions and criticisms can be sent to: sean@conman.org
--
-- ********************************************************************
-- luacheck: ignore 611 631
-- HTML 4.01 strict
--
-- Usage notes:
--
--   * The module returns an LPeg pattern, not a function.  The pattern
--     yields a table of parsed nodes followed by the subject position just
--     past the last node that matched.
--
--   * The pattern uses lpeg.Carg(1) and indexes the value with `.pre`, so
--     the caller must supply a table as the first extra argument to
--     match(), e.g. `html:match(text,1,{})`.
--
--   * Each element node is a table with the fields `tag` (the element name
--     as spelled in this file), `attributes` (a table of name/value pairs)
--     and either `inline = true` or `block = true`; child nodes and text
--     runs follow in the array part.  A comment node is a table with the
--     single field `comment`.
--
--   * utf8.char is used for numeric character references, so a Lua with
--     the utf8 library (5.3 or later) is required.

local ENTITIES = require "org.conman.const.entity"
local lpeg     = require "lpeg"

local Carg = lpeg.Carg
local Cmt  = lpeg.Cmt
local Cc   = lpeg.Cc
local Cf   = lpeg.Cf
local Cg   = lpeg.Cg
local Cs   = lpeg.Cs
local Ct   = lpeg.Ct
local C    = lpeg.C
local P    = lpeg.P
local R    = lpeg.R
local S    = lpeg.S
local V    = lpeg.V

local EMPTY = P(true)        -- body of elements that have no content
local WS    = S" \t\r\n"

-- One character of element content, meant to be used inside lpeg.Cs():
-- decimal character references become UTF-8, named references are looked
-- up in ENTITIES, and a run of whitespace collapses to a single space
-- unless state.pre is set (state being the first extra match argument).

local CHAR = P"&#" * C(R"09"^1)                * P";" / utf8.char
           + P"&"  * C(R("az","AZ","09")^1)    * P";" / ENTITIES
           + Cmt( -- This exists JUST to support the PRE tag.
                  C(S" \t\r\n"^1 * Carg(1)),
                  function(_,position,cap,state)
                    return position,state.pre and cap or " "
                  end
                )
           + P(1)

-- Same as CHAR but for attribute values: whitespace always collapses and
-- no match-time state is consulted.

local CHARa = P"&#" * C(R"09"^1)             * P";" / utf8.char
            + P"&"  * C(R("az","AZ","09")^1) * P";" / ENTITIES
            + S" \t\r\n"^1 / " "
            + P(1)

-- *************************************************************************
-- H(s)  returns a pattern matching the string s without regard to case.
-- Hc(s) does the same and also produces s itself as the capture value, so
--       names and enumerated values come out in the spelling used here.
-- *************************************************************************

local Hc,H do
  -- Turn every character of the given string into a pattern, then fold
  -- the resulting patterns together into one sequence.
  local char = R("AZ","az") / function(c) return P(c:lower()) + P(c:upper()) end
             + P(1)         / function(c) return P(c) end
  local cis  = Cf(char^1,function(a,b) return a * b end)

  Hc = function(s) return cis:match(s) / s end
  H  = function(s) return cis:match(s)     end
end

-- *************************************************************************
-- Build the pattern for an inline element.
--
--      name            element name (matched case-insensitively)
--      attrib          pattern matching any one attribute the element allows
--      body            pattern for the element's content
--      optclose        true if the end tag may be omitted
--
-- The result captures a table with 'tag', 'attributes' and 'inline' fields
-- followed by whatever body captures.
-- *************************************************************************

local function tagi(name,attrib,body,optclose)
  local open = P"<" * Cg(Hc(name),'tag') * #S" \t\r\n>"
             * Cg(
                   Cf(Ct"" * Cg(attrib)^0,function(t,n,v) t[n] = v return t end),
                   'attributes'
                 )
             * Cg(Cc(true),'inline')
             * WS^0 * P">"

  -- The end tag has to be consumed here, otherwise the enclosing element
  -- sees a stray "</name>" where it expects more content or its own end tag.
  local close = P"</" * H(name) * P">"

  if optclose then
    close = close^-1
  end

  return Ct(open * body * close)
end

-- *************************************************************************
-- Build the pattern for a block element.
--
--      name            element name (matched case-insensitively)
--      attrib          pattern matching any one attribute the element allows
--      body            pattern for the element's content
--      optclose        true if the end tag may be omitted
--      optopen         true if the start tag may be omitted as well, in
--                      which case a node with empty attributes is produced
--
-- Whitespace around the start tag and after the end tag is consumed.  The
-- PRE element additionally sets state.pre while it is open so that CHAR
-- leaves whitespace alone inside it.
-- *************************************************************************

local function tagb(name,attrib,body,optclose,optopen)
  local otag,ctag do
    if name == 'pre' then
      otag = Cmt(
                  C(H(name) * #S" \t\r\n>" * Carg(1)),
                  function(_,position,capture,state)
                    state.pre = true
                    return position,capture:lower()
                  end
                )
      ctag = Cmt(
                  C(H(name) * Carg(1)),
                  function(_,position,_,state)
                    state.pre = false
                    return position
                  end
                )
    else
      otag = Hc(name)
      ctag = H(name)
    end
  end

  local open = WS^0 * P"<" * Cg(otag,'tag') * #S" \t\r\n>"
             * Cg(
                   Cf(Ct"" * Cg(attrib)^0,function(t,n,v) t[n] = v return t end),
                   'attributes'
                 )
             * Cg(Cc(true),'block')
             * WS^0 * P">" * WS^0

  -- ctag (not a plain H(name)) is used so that </pre> clears state.pre.
  local close = P"</" * ctag * P">" * WS^0

  if optclose then
    close = close^-1
  end

  if optopen then
    return Ct((open + Cg(Cc(name),'tag') * Cg(Ct"",'attributes') * Cg(Cc(true),'block')) * body * close)
  else
    return Ct(open * body * close)
  end
end

-- *************************************************************************
-- Build the pattern for one attribute.  It requires leading whitespace and
-- produces two captures: the attribute name and its value.
--
--      name            attribute name (matched case-insensitively)
--      value           optional pattern for the permitted values; when
--                      absent any text is accepted and run through CHARa
--
-- The value may be single quoted, double quoted or bare.  An attribute
-- given without "=value" yields the empty string as its value.
-- *************************************************************************

local function attribute(name,value)
  -- The lookahead keeps a name from matching a prefix of a longer name
  -- (e.g. 'accept' against 'accept-charset').
  local this = S" \t\r\n"^1 * Hc(name) * #S" \t\r\n=>"
  local that do
    if value then
      that = P"'" * value * P"'"
           + P'"' * value * P'"'
           + value
    else
      that = P"'" * Cs((CHARa - P"'")^0) * P"'"
           + P'"' * Cs((CHARa - P'"')^0) * P'"'
           + Cs((CHARa - S" \t\r\n>")^0)
    end
  end

  return this * (S" \t\r\n"^0 * P'=' * S" \t\r\n"^0 * that + Cc"")
end

-- *************************************************************************

-- A run of text up to the next tag, with references and whitespace
-- translated by CHAR.
local PCDATA = Cs((CHAR - P"<")^1)

-- Comments.  The named group has no nested captures, so its value is the
-- text between the delimiters.  The block form also eats trailing
-- whitespace, like the block tags do.
local Cinline = Ct(P"<!--" * Cg((P(1) - P"-->")^0,"comment") * P"-->")
local Cblock  = Ct(P"<!--" * Cg((P(1) - P"-->")^0,"comment") * P"-->") * WS^0

-- Every attribute the grammar knows about.  Keys are Lua identifiers, so
-- 'for' is stored as forr, and the restricted 'type' of BUTTON as type2.

local ATTR =
{
  abbr              = attribute('abbr'),              -- Text
  accept            = attribute('accept'),            -- ContentType
  accept_charset    = attribute('accept-charset'),    -- Charsets (HTML spells it with a hyphen)
  accesskey         = attribute('accesskey'),         -- Character
  action            = attribute('action'),            -- URI
  align             = attribute('align',Hc'left' + Hc'center' + Hc'right' + Hc'justify' + Hc'char'),
  allowfullscreen   = attribute('allowfullscreen',Hc'true'),
  allowscriptaccess = attribute('allowscriptaccess'), -- ???
  alt               = attribute('alt'),               -- Text or CDATA
  archive           = attribute('archive'),           -- CDATA
  axis              = attribute('axis'),              -- CDATA
  bgcolor           = attribute('bgcolor'),           -- Color
  border            = attribute('border'),            -- Pixels
  cellpadding       = attribute('cellpadding'),       -- Length
  cellspacing       = attribute('cellspacing'),       -- Length
  char              = attribute('char'),              -- Character
  charoff           = attribute('charoff'),           -- Length
  charset           = attribute('charset'),           -- Charset
  checked           = attribute('checked',Hc'checked'),
  cite              = attribute('cite'),              -- URI
  class             = attribute('class'),             -- CDATA
  classid           = attribute('classid'),           -- URI
  codebase          = attribute('codebase'),          -- URI
  codetype          = attribute('codetype'),          -- ContentType
  color             = attribute('color'),             -- Color
  cols              = attribute('cols',R"09"^1/tonumber),
  colspan           = attribute('colspan',R"09"^1/tonumber),
  content           = attribute('content'),           -- CDATA
  coords            = attribute('coords'),            -- Coords
  data              = attribute('data'),              -- URI
  datafld           = attribute('datafld'),           -- CDATA
  datapagesize      = attribute('datapagesize'),      -- CDATA
  datasrc           = attribute('datasrc'),           -- URI
  datetime          = attribute('datetime'),          -- Datetime
  declare           = attribute('declare',Hc'declare'),
  defer             = attribute('defer',Hc'defer'),
  dir               = attribute('dir', Hc'ltr' + Hc'rtl'),
  disabled          = attribute('disabled',Hc'disabled'),
  enctype           = attribute('enctype'),           -- ContentType
  event             = attribute('event'),             -- CDATA
  face              = attribute('face'),              -- CDATA
  flashvars         = attribute('flashvars'),         -- ???
  forr              = attribute('for'),               -- URI
  frame             = attribute('frame',Hc'void' + Hc'above' + Hc'below' + Hc'hsides' + Hc'lhs' + Hc'rhs' + Hc'vsides' + Hc'box' + Hc'border'),
  headers           = attribute('headers'),           -- IDREFS
  height            = attribute('height'),            -- Length
  href              = attribute('href'),              -- URI
  hreflang          = attribute('hreflang'),          -- LanguageCode
  http_equiv        = attribute('http-equiv'),        -- NAME
  id                = attribute('id'),                -- ID
  ismap             = attribute('ismap',Hc'ismap'),
  label             = attribute('label'),             -- Text
  lang              = attribute('lang'),              -- LanguageCode
  longdesc          = attribute('longdesc'),          -- URI
  maxlength         = attribute('maxlength',R"09"^1/tonumber),
  media             = attribute('media'),             -- MediaDesc
  method            = attribute('method',Hc'GET' + Hc'POST'),
  multiple          = attribute('multiple',Hc'multiple'),
  name              = attribute('name'),              -- CDATA
  nohref            = attribute('nohref',Hc'nohref'),
  onblur            = attribute('onblur'),            -- Script
  onchange          = attribute('onchange'),          -- Script
  onclick           = attribute('onclick'),           -- Script
  ondblclick        = attribute('ondblclick'),        -- Script
  onfocus           = attribute('onfocus'),           -- Script
  onkeydown         = attribute('onkeydown'),         -- Script
  onkeypress        = attribute('onkeypress'),        -- Script
  onkeyup           = attribute('onkeyup'),           -- Script
  onmousedown       = attribute('onmousedown'),       -- Script
  onmousemove       = attribute('onmousemove'),       -- Script
  onmouseout        = attribute('onmouseout'),        -- Script
  onmouseover       = attribute('onmouseover'),       -- Script
  onmouseup         = attribute('onmouseup'),         -- Script
  onreset           = attribute('onreset'),           -- Script
  onselect          = attribute('onselect'),          -- Script
  onsubmit          = attribute('onsubmit'),          -- Script
  pluginspage       = attribute('pluginspage'),       -- URI
  profile           = attribute('profile'),           -- URI
  quality           = attribute('quality'),           -- ???
  readonly          = attribute('readonly',Hc'readonly'),
  rel               = attribute('rel'),               -- LinkTypes
  rev               = attribute('rev'),               -- LinkTypes
  rows              = attribute('rows',R"09"^1/tonumber),
  rowspan           = attribute('rowspan',R"09"^1/tonumber),
  rules             = attribute('rules',Hc'none' + Hc'groups' + Hc'rows' + Hc'cols' + Hc'all'),
  scheme            = attribute('scheme'),            -- CDATA
  scope             = attribute('scope'),             -- Scope
  selected          = attribute('selected',Hc'selected'),
  shape             = attribute('shape'),             -- Shape
  size              = attribute('size'),              -- CDATA
  span              = attribute('span',R"09"^1/tonumber),
  src               = attribute('src'),               -- URI
  standby           = attribute('standby'),           -- Text
  start             = attribute('start',R"09"^1 / tonumber), -- XXX non standard
  style             = attribute('style'),             -- StyleSheet
  summary           = attribute('summary'),           -- Text
  tabindex          = attribute('tabindex',R"09"^1/tonumber),
  title             = attribute('title'),             -- Text
  type              = attribute('type'),              -- ContentType
  type2             = attribute('type',Hc'button' + Hc'submit' + Hc'reset'),
  usemap            = attribute('usemap'),            -- URI
  valign            = attribute('valign',Hc'top' + Hc'middle' + Hc'bottom' + Hc'baseline'),
  value             = attribute('value'),             -- CDATA
  valuetype         = attribute('valuetype',Hc'DATA' + Hc'REF' + Hc'OBJECT'),
  width             = attribute('width'),             -- Length
}

-- Attribute sets shared by many elements.

local coreattrs  = ATTR.id   + ATTR.class + ATTR.style + ATTR.title
local i18n       = ATTR.lang + ATTR.dir
local events     = ATTR.onclick     + ATTR.ondblclick  + ATTR.onmousedown
                 + ATTR.onmouseup   + ATTR.onmouseover + ATTR.onmousemove
                 + ATTR.onmouseout  + ATTR.onkeypress  + ATTR.onkeydown
                 + ATTR.onkeyup
local reserved   = ATTR.datasrc + ATTR.datafld
local attrs      = coreattrs + i18n + events
local cellhalign = ATTR.align + ATTR.char + ATTR.charoff
local cellvalign = ATTR.valign

-- Attribute sets for individual elements.

local A_attr        = attrs
                    + ATTR.charset   + ATTR.type     + ATTR.name
                    + ATTR.href      + ATTR.hreflang + ATTR.rel
                    + ATTR.rev       + ATTR.accesskey + ATTR.shape
                    + ATTR.coords    + ATTR.tabindex + ATTR.onfocus
                    + ATTR.onblur
local IMG_attr      = attrs
                    + ATTR.src       + ATTR.alt      + ATTR.longdesc
                    + ATTR.name      + ATTR.height   + ATTR.width
                    + ATTR.usemap    + ATTR.ismap
local SCRIPT_attr   = events
                    + ATTR.charset   + ATTR.type     + ATTR.src
                    + ATTR.defer     + ATTR.forr
local AREA_attr     = attrs
                    + ATTR.shape     + ATTR.coords   + ATTR.href
                    + ATTR.nohref    + ATTR.alt      + ATTR.tabindex
                    + ATTR.accesskey + ATTR.onfocus  + ATTR.onblur
local OBJECT_attr   = attrs
                    + ATTR.declare   + ATTR.classid  + ATTR.codebase
                    + ATTR.data      + ATTR.type     + ATTR.codetype
                    + ATTR.archive   + ATTR.standby  + ATTR.height
                    + ATTR.width     + ATTR.usemap   + ATTR.name
                    + ATTR.tabindex  + reserved
local PARAM_attr    = ATTR.id        + ATTR.name     + ATTR.value
                    + ATTR.valuetype + ATTR.type
local EMBED_attr    = attrs
                    + ATTR.align     + ATTR.allowfullscreen + ATTR.allowscriptaccess
                    + ATTR.bgcolor   + ATTR.flashvars + ATTR.height
                    + ATTR.href      + ATTR.pluginspage + ATTR.quality
                    + ATTR.src       + ATTR.type     + ATTR.width
local FONT_attr     = coreattrs + i18n
                    + ATTR.size      + ATTR.color    + ATTR.face
local INPUT_attr    = attrs
                    + ATTR.type      + ATTR.name     + ATTR.value
                    + ATTR.checked   + ATTR.disabled + ATTR.readonly
                    + ATTR.size      + ATTR.maxlength + ATTR.src
                    + ATTR.alt       + ATTR.usemap   + ATTR.ismap
                    + ATTR.tabindex  + ATTR.accesskey + ATTR.onfocus
                    + ATTR.onblur    + ATTR.onselect + ATTR.onchange
                    + ATTR.accept    + reserved
local SELECT_attr   = attrs
                    + ATTR.name      + ATTR.size     + ATTR.multiple
                    + ATTR.disabled  + ATTR.tabindex + ATTR.onfocus
                    + ATTR.onblur    + ATTR.onchange + reserved
local TEXTAREA_attr = attrs
                    + ATTR.name      + ATTR.rows     + ATTR.cols
                    + ATTR.disabled  + ATTR.readonly + ATTR.tabindex
                    + ATTR.accesskey + ATTR.onfocus  + ATTR.onblur
                    + ATTR.onselect  + ATTR.onchange + reserved
local LABEL_attr    = attrs
                    + ATTR.forr      + ATTR.accesskey + ATTR.onfocus
                    + ATTR.onblur
local BUTTON_attr   = attrs
                    + ATTR.name      + ATTR.value    + ATTR.type2
                    + ATTR.disabled  + ATTR.tabindex + ATTR.accesskey
                    + ATTR.onfocus   + ATTR.onblur   + reserved
local FORM_attr     = attrs
                    + ATTR.action    + ATTR.method   + ATTR.enctype
                    + ATTR.accept    + ATTR.name     + ATTR.onsubmit
                    + ATTR.onreset   + ATTR.accept_charset
local TABLE_attr    = attrs
                    + ATTR.summary   + ATTR.width    + ATTR.border
                    + ATTR.frame     + ATTR.rules    + ATTR.cellspacing
                    + ATTR.cellpadding + ATTR.datapagesize + reserved
local THD_attr      = attrs
                    + ATTR.abbr      + ATTR.axis     + ATTR.headers
                    + ATTR.scope     + ATTR.rowspan  + ATTR.colspan
                    + cellhalign     + cellvalign

-- *************************************************************************
-- The grammar.  One match of it is a single flow item: a block element, an
-- inline element, a comment or a run of text.
-- *************************************************************************

local parse_tags = P {
  'BODY',
  BODY = V'flow', -- XXX
  flow = V'block' + V'inline',

  --=======================================================================

  inline = V'fontstyle' + V'phrase' + V'special' + V'iINS'
         + V'formctrl'  + V'iDEL'   + Cinline    + PCDATA,

  fontstyle = V'TT' + V'I' + V'B' + V'BIG' + V'SMALL' + V'U',
  TT        = tagi('tt'      , attrs , V'inline'^0),
  I         = tagi('i'       , attrs , V'inline'^0),
  B         = tagi('b'       , attrs , V'inline'^0),
  BIG       = tagi('big'     , attrs , V'inline'^0),
  SMALL     = tagi('small'   , attrs , V'inline'^0),
  U         = tagi('u'       , attrs , V'inline'^0), -- XXX non-standard

  phrase    = V'EM'  + V'STRONG' + V'DFN'  + V'CODE' + V'SAMP'
            + V'KBD' + V'VAR'    + V'CITE' + V'ABBR' + V'ACRONYM',
  EM        = tagi('em'      , attrs , V'inline'^0),
  STRONG    = tagi('strong'  , attrs , V'inline'^0),
  DFN       = tagi('dfn'     , attrs , V'inline'^0),
  CODE      = tagi('code'    , attrs , V'inline'^0),
  SAMP      = tagi('samp'    , attrs , V'inline'^0),
  KBD       = tagi('kbd'     , attrs , V'inline'^0),
  VAR       = tagi('var'     , attrs , V'inline'^0),
  CITE      = tagi('cite'    , attrs , V'inline'^0),
  ABBR      = tagi('abbr'    , attrs , V'inline'^0),
  ACRONYM   = tagi('acronym' , attrs , V'inline'^0),

  special   = V'A'   + V'IMG' + V'BR'  + V'SCRIPT' + V'BDO'
            + V'Q'   + V'SUB' + V'SUP' + V'SPAN'   + V'OBJECT'
            + V'FONT' + V'MAP',
  A         = tagi('a'       , A_attr               , (V'inline' - V'A')^0),
  IMG       = tagi('img'     , IMG_attr             , EMPTY,true),
  BR        = tagi('br'      , coreattrs            , EMPTY,true),
  -- The script body runs up to (not including) its end tag.
  SCRIPT    = tagi('script'  , SCRIPT_attr          , Cs((CHAR - (P"</" * H"script" * P">"))^0)), -- Script
  Q         = tagi('q'       , (attrs + ATTR.cite),V'inline'^0),
  SUB       = tagi('sub'     , attrs                , V'inline'^0),
  SUP       = tagi('sup'     , attrs                , V'inline'^0),
  SPAN      = tagi('span'    , (attrs + reserved)   , V'inline'^0),
  BDO       = tagi('bdo'     , (coreattrs + ATTR.lang + ATTR.dir),V'inline'^0),
  MAP       = tagi('map'     , (attrs + ATTR.name)  , (V'block' + V'AREA' + WS)^1),
  AREA      = tagi('area'    , AREA_attr            , EMPTY , true),
  OBJECT    = tagi('object'  , OBJECT_attr          , (V'PARAM' + V'EMBED' + V'flow')^0),
  PARAM     = tagi('param'   , PARAM_attr           , EMPTY , true),
  EMBED     = tagi('embed'   , EMBED_attr           , V'inline'^0), -- XXX non-standard
  FONT      = tagi('font'    , FONT_attr            , V'inline'^0), -- XXX non-standard

  formctrl  = V'INPUT' + V'SELECT' + V'TEXTAREA' + V'LABEL' + V'BUTTON',
  INPUT     = tagi('input'    , INPUT_attr    , EMPTY,true),
  SELECT    = tagi('select'   , SELECT_attr   , (V'OPTGROUP' + V'OPTION' + WS)^1),
  TEXTAREA  = tagi('textarea' , TEXTAREA_attr , PCDATA^0),
  LABEL     = tagi('label'    , LABEL_attr    , (V'inline' - V'LABEL')^0),
  BUTTON    = tagi('button'   , BUTTON_attr   , (V'flow' - (V'A' + V'formctrl' + V'FORM' + V'FIELDSET'))^0),
  OPTGROUP  = tagi('optgroup' , (attrs + ATTR.disabled + ATTR.label),(V'OPTION' + WS)^1),
  OPTION    = tagi('option'   , (attrs + ATTR.selected + ATTR.disabled + ATTR.label + ATTR.value),PCDATA^0,true),

  iINS      = tagi('ins' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
  iDEL      = tagi('del' , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),

  --=======================================================================

  block = V'P'        + V'PRE'     + V'DL' + V'NOSCRIPT' + V'BLOCKQUOTE'
        + V'FORM'     + V'HR'      + V'TABLE'
        + V'FIELDSET' + V'ADDRESS'
        + V'H1' + V'H2' + V'H3' + V'H4' + V'H5' + V'H6'
        + V'UL' + V'OL'
        + V'DIV'
        + V'bINS' + V'bDEL'
        + Cblock
        ,

  -- Inline elements that may not appear inside PRE.
  pre_exclude = V'IMG' + V'OBJECT' + V'BIG' + V'SMALL' + V'SUB' + V'SUP',

  P          = tagb('p'          , attrs , V'inline'^0,true),
  PRE        = tagb('pre'        , attrs , (V'inline' - V'pre_exclude')^0),
  BLOCKQUOTE = tagb('blockquote' , (attrs + ATTR.cite),(V'block' + V'SCRIPT')^1),
  HR         = tagb('hr'         , attrs , EMPTY,true),
  ADDRESS    = tagb('address'    , attrs , V'inline'^0),
  H1         = tagb('h1'         , attrs , V'inline'^0),
  H2         = tagb('h2'         , attrs , V'inline'^0),
  H3         = tagb('h3'         , attrs , V'inline'^0),
  H4         = tagb('h4'         , attrs , V'inline'^0),
  H5         = tagb('h5'         , attrs , V'inline'^0),
  H6         = tagb('h6'         , attrs , V'inline'^0),
  DIV        = tagb('div'        , (attrs + reserved),V'flow'^0),

  DL         = tagb('dl'         , attrs , (V'DT' + V'DD')^1),
  DT         = tagb('dt'         , attrs , V'inline'^0,true),
  DD         = tagb('dd'         , attrs , V'flow'^0,true),

  UL         = tagb('ul'         , attrs , (V'LI' + Cblock)^1),
  OL         = tagb('ol'         , attrs + ATTR.start + ATTR.type, (V'LI' + Cblock)^1), -- XXX non-standard
  LI         = tagb('li'         , attrs , V'flow'^0,true),

  NOSCRIPT   = tagb('noscript'   , attrs , V'block'^1),

  FORM       = tagb('form'       , FORM_attr , ((V'block' + V'SCRIPT') - V'FORM')^0),
  FIELDSET   = tagb('fieldset'   , attrs , (V'LEGEND' + V'flow' + PCDATA)^0),
  LEGEND     = tagb('legend'     , (attrs + ATTR.accesskey),V'inline'^0),

  TABLE      = tagb('table'      , TABLE_attr,V'CAPTION'^-1 * (V'COL' + V'COLGROUP')^0 * V'THEAD'^-1 * V'TFOOT'^-1 * V'TBODY'^1),
  CAPTION    = tagb('caption'    , attrs,V'inline'^0),
  COL        = tagb('col'        , (attrs + ATTR.span + ATTR.width + cellhalign + cellvalign),EMPTY,true),
  COLGROUP   = tagb('colgroup'   , (attrs + ATTR.span + ATTR.width + cellhalign + cellvalign),V'COL'^0,true),
  THEAD      = tagb('thead'      , (attrs + cellhalign + cellvalign) , V'TR'^1,true),
  TFOOT      = tagb('tfoot'      , (attrs + cellhalign + cellvalign) , V'TR'^1,true),
  -- Both tags of TBODY are optional, so bare rows inside TABLE still parse.
  TBODY      = tagb('tbody'      , (attrs + cellhalign + cellvalign) , V'TR'^1,true,true),
  TR         = tagb('tr'         , (attrs + cellhalign + cellvalign) , (V'TH' + V'TD')^1,true),
  TH         = tagb('th'         , THD_attr , V'flow'^0,true),
  TD         = tagb('td'         , THD_attr , V'flow'^0,true),

  bINS       = tagb('ins'        , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
  bDEL       = tagb('del'        , (attrs + ATTR.cite + ATTR.datetime),V'flow'^0),
}

-- *************************************************************************

-- A table of every top-level node that matched, then the position where
-- matching stopped (compare it against #subject + 1 to detect trailing
-- input that did not parse).
return Ct(parse_tags^1) * lpeg.Cp()