gHtmlParser.cpp

Go to the documentation of this file.
00001 // gHtmlParser, main part for libgdhtml
00002 
00003 #include <string.h>
00004 #include "gHtmlParser.h"
00005 #include "gHtmlCtrl.h"
00006 #include "gHtmlSeq.h"
00007 #include "gstringext.h"
00008 
00009 ////////////////////////////////////////////////////////////
00010 // Static members
00011 const char* gHParsed::tblStateStr[e_HS_Last]={
00012         "START",
00013         "START_",
00014         "HTML",
00015         "HEAD",
00016         "HEAD_",
00017         "BODY",
00018         "BODY_",
00019         "HTML_"};
00020 
00021 t_int16 gHtmlParser::nElems=-1;
00022 sHtmlElement gHtmlParser::lElems[]={
00023         {0, "A",        "anchor",       '@',    '@',    '@',    '@',    '@'},  // 0
00024         {0, "ABBR",     "abbreviated form",     '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00025         {0, "ACRONYM",  "acronym",      '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00026         {0, "ADDRESS",  "information on author",        '@',    '@',    '@',    '@',    '@'},
00027         {0, "APPLET",   "Java applet",  '@',    '@',    'D',    'L',    '@'},
00028         {0, "AREA",     "client-side image map",        '@',    'F',    '@',    '@',    '@'},
00029         {0, "B",        "bold text style",      '@',    '@',    '@',    '@',    '@'},
00030         {0, "BASE",     "document base URI",    '@',    'F',    '@',    '@',    '@'},  // 7
00031         {0, "BASEFONT", "base font size",       '@',    'F',    'D',    'L',    '@'},
00032         {0, "BDO",      "I18N BiDi over-ride",  '@',    '@',    '@',    '@',    '@'},
00033         {0, "BIG",      "large text style",     '@',    '@',    '@',    '@',    '@'},
00034         {0, "BLOCKQUOTE",       "long quotation",       '@',    '@',    '@',    '@',    '@'},
00035         {0, "BODY",     "document body",        'O',    'O',    '@',    '@',    '@'},  // 12
00036         {0, "BR",       "forced line break",    '@',    'F',    '@',    '@',    '@'},
00037         {0, "BUTTON",   "push button",  '@',    '@',    '@',    '@',    '@'},
00038         {0, "CAPTION",  "table caption",        '@',    '@',    '@',    '@',    '@'},
00039         {0, "CENTER",   "center line (deprecated)",     '@',    '@',    'D',    'L',    '@'},
00040         {0, "CITE",     "citation",     '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00041         {0, "CODE",     "computer code fragment",       '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00042         {0, "COL",      "table column", '@',    'F',    '@',    '@',    '@'},
00043         {0, "COLGROUP", "table column group",   '@',    'O',    '@',    '@',    '@'},
00044         {0, "DD",       "definition description",       '@',    'O',    '@',    '@',    '@'},
00045         {0, "DEL",      "deleted text", '@',    '@',    '@',    '@',    '@'},
00046         {0, "DFN",      "instance definition",  '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00047         {0, "DIR",      "directory list",       '@',    '@',    'D',    'L',    '@'},
00048         {0, "DIV",      "generic language/style",       '@',    '@',    '@',    '@',    '@'},
00049         {0, "DL",       "definition list",      '@',    '@',    '@',    '@',    '@'},
00050         {0, "DT",       "definition term",      '@',    'O',    '@',    '@',    '@'},
00051         {0, "EM",       "emphasis",     '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00052         {0, "FIELDSET", "form control group",   '@',    '@',    '@',    '@',    '@'},
00053         {0, "FONT",     "local change to font", '@',    '@',    'D',    'L',    '@'},
00054         {0, "FORM",     "interactive form",     '@',    '@',    '@',    '@',    '@'},
00055         {0, "FRAME",    "subwindow",    '@',    'F',    '@',    'F',    '@'},
00056         {0, "FRAMESET", "window subdivision",   '@',    '@',    '@',    'F',    '@'},
00057         {0, "H1",       "heading",      '@',    '@',    '@',    '@',    '@'},
00058         {0, "H2",       "heading",      '@',    '@',    '@',    '@',    '@'},
00059         {0, "H3",       "heading",      '@',    '@',    '@',    '@',    '@'},
00060         {0, "H4",       "heading",      '@',    '@',    '@',    '@',    '@'},
00061         {0, "H5",       "heading",      '@',    '@',    '@',    '@',    '@'},
00062         {0, "H6",       "heading",      '@',    '@',    '@',    '@',    '@'},
00063         {0, "HEAD",     "document head",        'O',    'O',    '@',    '@',    '@'},  // 40
00064         {0, "HR",       "horizontal rule",      '@',    'F',    '@',    '@',    '@'},
00065         {0, "HTML",     "document root element",        'O',    'O',    '@',    '@',    '@'},  // 42
00066         {0, "I",        "italic text style",    '@',    '@',    '@',    '@',    '@'},
00067         {0, "IFRAME",   "inline subwindow",     '@',    '@',    '@',    'L',    '@'},
00068         {0, "IMG",      "Embedded image",       '@',    'F',    '@',    '@',    '@'},
00069         {0, "INPUT",    "form control", '@',    'F',    '@',    '@',    '@'},
00070         {0, "INS",      "inserted text",        '@',    '@',    '@',    '@',    '@'},
00071         {0, "ISINDEX",  "single line prompt",   '@',    'F',    'D',    'L',    '@'},
00072         {0, "KBD",      "text to be entered by user",   '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00073         {0, "LABEL",    "form field label text",        '@',    '@',    '@',    '@',    '@'},
00074         {0, "LEGEND",   "fieldset legend",      '@',    '@',    '@',    '@',    '@'},
00075         {0, "LI",       "list item",    '@',    'O',    '@',    '@',    '@'},
00076         {0, "LINK",     "media-independent link",       '@',    'F',    '@',    '@',    '@'},
00077         {0, "MAP",      "client-side image map",        '@',    '@',    '@',    '@',    '@'},
00078         {0, "MENU",     "menu list",    '@',    '@',    'D',    'L',    '@'},
00079         {0, "META",     "generic meta info",    '@',    'F',    '@',    '@',    '@'},
00080         {0, "NOFRAMES", "alternate content",    '@',    '@',    '@',    'F',    '@'},
00081         {0, "NOSCRIPT", "alternate content",    '@',    '@',    '@',    '@',    '@'},
00082         {0, "OBJECT",   "generic embedded object",      '@',    '@',    '@',    '@',    '@'},
00083         {0, "OL",       "ordered list", '@',    '@',    '@',    '@',    '@'},
00084         {0, "OPTGROUP", "option group", '@',    '@',    '@',    '@',    '@'},
00085         {0, "OPTION",   "selectable choice",    '@',    'O',    '@',    '@',    '@'},
00086         {0, "P",        "paragraph",    '@',    'O',    '@',    '@',    '@'},
00087         {0, "PARAM",    "named property value", '@',    'F',    '@',    '@',    '@'},
00088         {0, "PRE",      "preformatted text",    '@',    '@',    '@',    '@',    '@'},
00089         {0, "Q",        "short inline quotation",       '@',    '@',    '@',    '@',    '@'},
00090         {0, "S",        "strike-through text style",    '@',    '@',    'D',    'L',    '@'},
00091         {0, "SAMP",     "sample program output",        '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00092         {0, "SCRIPT",   "script statements",    '@',    '@',    '@',    '@',    '@'},
00093         {0, "SELECT",   "option selector",      '@',    '@',    '@',    '@',    '@'},
00094         {0, "SMALL",    "small text style",     '@',    '@',    '@',    '@',    '@'},
00095         {0, "SPAN",     "generic language/style",       '@',    '@',    '@',    '@',    '@'},
00096         {0, "STRIKE",   "strike-through text",  '@',    '@',    'D',    'L',    '@'},
00097         {0, "STRONG",   "strong emphasis",      '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00098         {0, "STYLE",    "style info",   '@',    '@',    '@',    '@',    '@'},
00099         {0, "SUB",      "subscript",    '@',    '@',    '@',    '@',    '@'},
00100         {0, "SUP",      "superscript",  '@',    '@',    '@',    '@',    '@'},
00101         {0, "TABLE",    "table",        '@',    '@',    '@',    '@',    '@'},
00102         {0, "TBODY",    "table body",   'O',    'O',    '@',    '@',    '@'},
00103         {0, "TD",       "table data cell",      '@',    'O',    '@',    '@',    '@'},
00104         {0, "TEXTAREA", "multi-line text field",        '@',    '@',    '@',    '@',    '@'},
00105         {0, "TFOOT",    "table footer", '@',    'O',    '@',    '@',    '@'},
00106         {0, "TH",       "table header cell",    '@',    'O',    '@',    '@',    '@'},
00107         {0, "THEAD",    "table header", '@',    'O',    '@',    '@',    '@'},
00108         {0, "TITLE",    "document title",       '@',    '@',    '@',    '@',    '@'},
00109         {0, "TR",       "table row",    '@',    'O',    '@',    '@',    '@'},
00110         {0, "TT",       "teletype or monospaced",       '@',    '@',    '@',    '@',    '@'},
00111         {0, "U",        "underlined text style",        '@',    '@',    'D',    'L',    '@'},
00112         {0, "UL",       "unordered list",       '@',    '@',    '@',    '@',    '@'},
00113         {0, "VAR",      "instance of a variable",       '@',    '@',    '@',    '@',    e_HtmlElementPhrase},
00114         {0, NULL,       NULL,   '@',    '@',    '@',    '@',    '\0'}};
00115 
00116 
00117 sAttrDef gHtmlParser::lAttrs[]={
00118         {0, "abbr",     "TD, TH",       "%Text;",       "#I",   'n',    "",     "abbreviation for header cell"},
00119         {0, "accept-charset",   "FORM", "%Charsets;",   "#I",   'n',    "",     "list of supported charsets"},
00120         {0, "accept",   "FORM, INPUT",  "%ContentTypes;",       "#I",   'n',    "",     "list of MIME types for file upload"},
00121         {0, "accesskey",        "A, AREA, BUTTON, INPUT, LABEL, LEGEND, TEXTAREA",      "%Character;",  "#I",   'n',    "",     "accessibility key character"},
00122         {0, "action",   "FORM", "%URI;",        "#R",   'n',    "",     "server-side form handler"},
00123         {0, "align",    "CAPTION",      "%CAlign;",     "#I",   'D',    "L",    "relative to table"},
00124         {0, "align",    "APPLET, IFRAME, IMG, INPUT, OBJECT",   "%IAlign;",     "#I",   'D',    "L",    "vertical or horizontal alignment"},
00125         {0, "align",    "LEGEND",       "%LAlign;",     "#I",   'D',    "L",    "relative to fieldset"},
00126         {0, "align",    "TABLE",        "%TAlign;",     "#I",   'D',    "L",    "table position relative to window"},
00127         {0, "align",    "HR",   "(left|center|right)",  "#I",   'D',    "L",    ""},
00128         {0, "align",    "DIV, H1, H2, H3, H4, H5, H6, P",       "(left|center|right|justify)",  "#I",   'D',    "L",    "align, text alignment"},
00129         {0, "align",    "COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR",       "(left|center|right|justify|char)",     "#I",   'n',    "",     ""},
00130         {0, "alink",    "BODY", "%Color;",      "#I",   'D',    "L",    "color of selected links"},
00131         {0, "alt",      "APPLET",       "%Text;",       "#I",   'D',    "L",    "short description"},
00132         {0, "alt",      "AREA, IMG",    "%Text;",       "#R",   'n',    "",     "short description"},
00133         {0, "alt",      "INPUT",        "CDATA",        "#I",   'n',    "",     "short description"},
00134         {0, "archive",  "APPLET",       "CDATA",        "#I",   'D',    "L",    "comma-separated archive list"},
00135         {0, "archive",  "OBJECT",       "CDATA",        "#I",   'n',    "",     "space-separated list of URIs"},
00136         {0, "axis",     "TD, TH",       "CDATA",        "#I",   'n',    "",     "comma-separated list of related headers"},
00137         {0, "background",       "BODY", "%URI;",        "#I",   'D',    "L",    "texture tile for document background"},
00138         {0, "bgcolor",  "TABLE",        "%Color;",      "#I",   'D',    "L",    "background color for cells"},
00139         {0, "bgcolor",  "TR",   "%Color;",      "#I",   'D',    "L",    "background color for row"},
00140         {0, "bgcolor",  "TD, TH",       "%Color;",      "#I",   'D',    "L",    "cell background color"},
00141         {0, "bgcolor",  "BODY", "%Color;",      "#I",   'D',    "L",    "document background color"},
00142         {0, "border",   "TABLE",        "%Pixels;",     "#I",   'n',    "",     "controls frame width around table"},
00143         {0, "border",   "IMG, OBJECT",  "%Pixels;",     "#I",   'D',    "L",    "link border width"},
00144         {0, "cellpadding",      "TABLE",        "%Length;",     "#I",   'n',    "",     "spacing within cells"},
00145         {0, "cellspacing",      "TABLE",        "%Length;",     "#I",   'n',    "",     "spacing between cells"},
00146         {0, "char",     "COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR",       "%Character;",  "#I",   'n',    "",     "alignment char, e.g. char :"},
00147         {0, "charoff",  "COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR",       "%Length;",     "#I",   'n',    "",     "offset for alignment char"},
00148         {0, "charset",  "A, LINK, SCRIPT",      "%Charset;",    "#I",   'n',    "",     "char encoding of linked resource"},
00149         {0, "checked",  "INPUT",        "(checked)",    "#I",   'n',    "",     "for radio buttons and check boxes"},
00150         {0, "cite",     "BLOCKQUOTE, Q",        "%URI;",        "#I",   'n',    "",     "URI for source document or msg"},
00151         {0, "cite",     "DEL, INS",     "%URI;",        "#I",   'n',    "",     "info on reason for change"},
00152         {0, "class",    "@BASE, BASEFONT, HEAD, HTML, META, PARAM, SCRIPT, STYLE, TITLE",       "CDATA",        "#I",   'n',    "",     "space-separated list of classes"},
00153         {0, "classid",  "OBJECT",       "%URI;",        "#I",   'n',    "",     "identifies an implementation"},
00154         {0, "clear",    "BR",   "(left|all|right|none)",        "none", 'D',    "L",    "control of text flow"},
00155         {0, "code",     "APPLET",       "CDATA",        "#I",   'D',    "L",    "applet class file"},
00156         {0, "codebase", "OBJECT",       "%URI;",        "#I",   'n',    "",     "base URI for classid, data, archive"},
00157         {0, "codebase", "APPLET",       "%URI;",        "#I",   'D',    "L",    "optional base URI for applet"},
00158         {0, "codetype", "OBJECT",       "%ContentType;",        "#I",   'n',    "",     "content type for code"},
00159         {0, "color",    "BASEFONT, FONT",       "%Color;",      "#I",   'D',    "L",    "text color"},
00160         {0, "cols",     "FRAMESET",     "%MultiLengths;",       "#I",   'n',    "F",    "list of lengths, default: 100% (1 col)"},
00161         {0, "cols",     "TEXTAREA",     "NUMBER",       "#R",   'n',    "",     ""},
00162         {0, "colspan",  "TD, TH",       "NUMBER",       "1",    'n',    "",     "number of cols spanned by cell"},
00163         {0, "compact",  "DIR, DL, MENU, OL, UL",        "(compact)",    "#I",   'D',    "L",    "reduced interitem spacing"},
00164         {0, "content",  "META", "CDATA",        "#R",   'n',    "",     "associated information"},
00165         {0, "coords",   "AREA", "%Coords;",     "#I",   'n',    "",     "comma-separated list of lengths"},
00166         {0, "coords",   "A",    "%Coords;",     "#I",   'n',    "",     "for use with client-side image maps"},
00167         {0, "data",     "OBJECT",       "%URI;",        "#I",   'n',    "",     "reference to object$s data"},
00168         {0, "datetime", "DEL, INS",     "%Datetime;",   "#I",   'n',    "",     "date and time of change"},
00169         {0, "declare",  "OBJECT",       "(declare)",    "#I",   'n',    "",     "declare but don$t instantiate flag"},
00170         {0, "defer",    "SCRIPT",       "(defer)",      "#I",   'n',    "",     "UA may defer execution of script"},
00171         {0, "dir",      "@APPLET, BASE, BASEFONT, BDO, BR, FRAME, FRAMESET, IFRAME, PARAM, SCRIPT",     "(ltr|rtl)",    "#I",   'n',    "",     "direction for weak/neutral text"},
00172         {0, "dir",      "BDO",  "(ltr|rtl)",    "#R",   'n',    "",     "directionality"},
00173         {0, "disabled", "BUTTON, INPUT, OPTGROUP, OPTION, SELECT, TEXTAREA",    "(disabled)",   "#I",   'n',    "",     "unavailable in this context"},
00174         {0, "enctype",  "FORM", "%ContentType;",        "application/x-www- form-urlencoded",   'n',    "",     ""},
00175         {0, "face",     "BASEFONT, FONT",       "CDATA",        "#I",   'D',    "L",    "comma-separated list of font names"},
00176         {0, "for",      "LABEL",        "IDREF",        "#I",   'n',    "",     "matches field ID value"},
00177         {0, "frame",    "TABLE",        "%TFrame;",     "#I",   'n',    "",     "which parts of frame to render"},
00178         {0, "frameborder",      "FRAME, IFRAME",        "(1|0)",        "1",    'n',    "F",    "request frame borders?"},
00179         {0, "headers",  "TD, TH",       "IDREFS",       "#I",   'n',    "",     "list of id$s for header cells"},
00180         {0, "height",   "IFRAME",       "%Length;",     "#I",   'f',    "L",    "frame height"},  // f in "Depr." means deprecated frame-set
00181         {0, "height",   "TD, TH",       "%Length;",     "#I",   'D',    "L",    "height for cell"},
00182         {0, "height",   "IMG, OBJECT",  "%Length;",     "#I",   'n',    "",     "override height"},
00183         {0, "height",   "APPLET",       "%Length;",     "#R",   'D',    "L",    "initial height"},
00184         {0, "href",     "A, AREA, LINK",        "%URI;",        "#I",   'n',    "",     "URI for linked resource"},
00185         {0, "href",     "BASE", "%URI;",        "#I",   'n',    "",     "URI that acts as base URI"},
00186         {0, "hreflang", "A, LINK",      "%LanguageCode;",       "#I",   'n',    "",     "language code"},
00187         {0, "hspace",   "APPLET, IMG, OBJECT",  "%Pixels;",     "#I",   'D',    "L",    "horizontal gutter"},
00188         {0, "http-equiv",       "META", "NAME", "#I",   'n',    "",     "HTTP response header name"},
00189         {0, "id",       "@BASE, HEAD, HTML, META, SCRIPT, STYLE, TITLE",        "ID",   "#I",   'n',    "",     "document-wide unique id"},
00190         {0, "ismap",    "IMG, INPUT",   "(ismap)",      "#I",   'n',    "",     "use server-side image map"},
00191         {0, "label",    "OPTION",       "%Text;",       "#I",   'n',    "",     "for use in hierarchical menus"},
00192         {0, "label",    "OPTGROUP",     "%Text;",       "#R",   'n',    "",     "for use in hierarchical menus"},
00193         {0, "lang",     "@APPLET, BASE, BASEFONT, BR, FRAME, FRAMESET, IFRAME, PARAM, SCRIPT",  "%LanguageCode;",       "#I",   'n',    "",     "language code"},
00194         {0, "language", "SCRIPT",       "CDATA",        "#I",   'D',    "L",    "predefined script language name"},
00195         {0, "link",     "BODY", "%Color;",      "#I",   'D',    "L",    "color of links"},
00196         {0, "longdesc", "IMG",  "%URI;",        "#I",   'n',    "",     "link to long description (complements alt)"},
00197         {0, "longdesc", "FRAME, IFRAME",        "%URI;",        "#I",   'n',    "F",    "link to long description (complements title)"},
00198         {0, "marginheight",     "FRAME, IFRAME",        "%Pixels;",     "#I",   'n',    "F",    "margin height in pixels"},
00199         {0, "marginwidth",      "FRAME, IFRAME",        "%Pixels;",     "#I",   'n',    "F",    "margin widths in pixels"},
00200         {0, "maxlength",        "INPUT",        "NUMBER",       "#I",   'n',    "",     "max chars for text fields"},
00201         {0, "media",    "STYLE",        "%MediaDesc;",  "#I",   'n',    "",     "designed for use with these media"},
00202         {0, "media",    "LINK", "%MediaDesc;",  "#I",   'n',    "",     "for rendering on these media"},
00203         {0, "method",   "FORM", "(GET|POST)",   "GET",  'n',    "",     "HTTP method used to submit the form"},
00204         {0, "multiple", "SELECT",       "(multiple)",   "#I",   'n',    "",     "default is single selection"},
00205         {0, "name",     "BUTTON, TEXTAREA",     "CDATA",        "#I",   'n',    "",     ""},
00206         {0, "name",     "APPLET",       "CDATA",        "#I",   'D',    "L",    "allows applets to find each other"},
00207         {0, "name",     "SELECT",       "CDATA",        "#I",   'n',    "",     "field name"},
00208         {0, "name",     "FORM", "CDATA",        "#I",   'n',    "",     "name of form for scripting"},
00209         {0, "name",     "FRAME, IFRAME",        "CDATA",        "#I",   'n',    "F",    "name of frame for targetting"},
00210         {0, "name",     "IMG",  "CDATA",        "#I",   'n',    "",     "name of image for scripting"},
00211         {0, "name",     "A",    "CDATA",        "#I",   'n',    "",     "named link end"},
00212         {0, "name",     "INPUT, OBJECT",        "CDATA",        "#I",   'n',    "",     "submit as part of form"},
00213         {0, "name",     "MAP",  "CDATA",        "#R",   'n',    "",     "for reference by usemap"},
00214         {0, "name",     "PARAM",        "CDATA",        "#R",   'n',    "",     "property name"},
00215         {0, "name",     "META", "NAME", "#I",   'n',    "",     "metainformation name"},
00216         {0, "nohref",   "AREA", "(nohref)",     "#I",   'n',    "",     "this region has no action"},
00217         {0, "noresize", "FRAME",        "(noresize)",   "#I",   'n',    "F",    "allow users to resize frames?"},
00218         {0, "noshade",  "HR",   "(noshade)",    "#I",   'D',    "L",    ""},
00219         {0, "nowrap",   "TD, TH",       "(nowrap)",     "#I",   'D',    "L",    "suppress word wrap"},
00220         {0, "object",   "APPLET",       "CDATA",        "#I",   'D',    "L",    "serialized applet file"},
00221         {0, "onblur",   "A, AREA, BUTTON, INPUT, LABEL, SELECT, TEXTAREA",      "%Script;",     "#I",   'n',    "",     "the element lost the focus"},
00222         {0, "onchange", "INPUT, SELECT, TEXTAREA",      "%Script;",     "#I",   'n',    "",     "the element value was changed"},
00223         {0, "onclick",  "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a pointer button was clicked"},
00224         {0, "ondblclick",       "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a pointer button was double clicked"},
00225         {0, "onfocus",  "A, AREA, BUTTON, INPUT, LABEL, SELECT, TEXTAREA",      "%Script;",     "#I",   'n',    "",     "the element got the focus"},
00226         {0, "onkeydown",        "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a key was pressed down"},
00227         {0, "onkeypress",       "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a key was pressed and released"},
00228         {0, "onkeyup",  "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a key was released"},
00229         {0, "onload",   "FRAMESET",     "%Script;",     "#I",   'n',    "F",    "all the frames have been loaded"},
00230         {0, "onload",   "BODY", "%Script;",     "#I",   'n',    "",     "the document has been loaded"},
00231         {0, "onmousedown",      "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a pointer button was pressed down"},
00232         {0, "onmousemove",      "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a pointer was moved within"},
00233         {0, "onmouseout",       "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a pointer was moved away"},
00234         {0, "onmouseover",      "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a pointer was moved onto"},
00235         {0, "onmouseup",        "@APPLET, BASE, BASEFONT, BDO, BR, FONT, FRAME, FRAMESET, HEAD, HTML, IFRAME, ISINDEX, META, PARAM, SCRIPT, STYLE, TITLE",      "%Script;",     "#I",   'n',    "",     "a pointer button was released"},
00236         {0, "onreset",  "FORM", "%Script;",     "#I",   'n',    "",     "the form was reset"},
00237         {0, "onselect", "INPUT, TEXTAREA",      "%Script;",     "#I",   'n',    "",     "some text was selected"},
00238         {0, "onsubmit", "FORM", "%Script;",     "#I",   'n',    "",     "the form was submitted"},
00239         {0, "onunload", "FRAMESET",     "%Script;",     "#I",   'n',    "F",    "all the frames have been removed"},
00240         {0, "onunload", "BODY", "%Script;",     "#I",   'n',    "",     "the document has been removed"},
00241         {0, "profile",  "HEAD", "%URI;",        "#I",   'n',    "",     "named dictionary of meta info"},
00242         {0, "prompt",   "ISINDEX",      "%Text;",       "#I",   'D',    "L",    "prompt message"},
00243         {0, "readonly", "TEXTAREA",     "(readonly)",   "#I",   'n',    "",     ""},
00244         {0, "readonly", "INPUT",        "(readonly)",   "#I",   'n',    "",     "for text and passwd"},
00245         {0, "rel",      "A, LINK",      "%LinkTypes;",  "#I",   'n',    "",     "forward link types"},
00246         {0, "rev",      "A, LINK",      "%LinkTypes;",  "#I",   'n',    "",     "reverse link types"},
00247         {0, "rows",     "FRAMESET",     "%MultiLengths;",       "#I",   'n',    "F",    "list of lengths, default: 100% (1 row)"},
00248         {0, "rows",     "TEXTAREA",     "NUMBER",       "#R",   'n',    "",     ""},
00249         {0, "rowspan",  "TD, TH",       "NUMBER",       "1",    'n',    "",     "number of rows spanned by cell"},
00250         {0, "rules",    "TABLE",        "%TRules;",     "#I",   'n',    "",     "rulings between rows and cols"},
00251         {0, "scheme",   "META", "CDATA",        "#I",   'n',    "",     "select form of content"},
00252         {0, "scope",    "TD, TH",       "%Scope;",      "#I",   'n',    "",     "scope covered by header cells"},
00253         {0, "scrolling",        "FRAME, IFRAME",        "(yes|no|auto)",        "auto", 'n',    "F",    "scrollbar or none"},
00254         {0, "selected", "OPTION",       "(selected)",   "#I",   'n',    "",     ""},
00255         {0, "shape",    "AREA", "%Shape;",      "rect", 'n',    "",     "controls interpretation of coords"},
00256         {0, "shape",    "A",    "%Shape;",      "rect", 'n',    "",     "for use with client-side image maps"},
00257         {0, "size",     "HR",   "%Pixels;",     "#I",   'D',    "L",    ""},
00258         {0, "size",     "FONT", "CDATA",        "#I",   'D',    "L",    "[+|-]nn e.g. size=+1, size=4"},
00259         {0, "size",     "INPUT",        "CDATA",        "#I",   'n',    "",     "specific to each type of field"},
00260         {0, "size",     "BASEFONT",     "CDATA",        "#R",   'D',    "L",    "base font size for FONT elements"},
00261         {0, "size",     "SELECT",       "NUMBER",       "#I",   'n',    "",     "rows visible"},
00262         {0, "span",     "COL",  "NUMBER",       "1",    'n',    "",     "COL attributes affect N columns"},
00263         {0, "span",     "COLGROUP",     "NUMBER",       "1",    'n',    "",     "default number of columns in group"},
00264         {0, "src",      "SCRIPT",       "%URI;",        "#I",   'n',    "",     "URI for an external script"},
00265         {0, "src",      "INPUT",        "%URI;",        "#I",   'n',    "",     "for fields with images"},
00266         {0, "src",      "FRAME, IFRAME",        "%URI;",        "#I",   'n',    "F",    "source of frame content"},
00267         {0, "src",      "IMG",  "%URI;",        "#R",   'n',    "",     "URI of image to embed"},
00268         {0, "standby",  "OBJECT",       "%Text;",       "#I",   'n',    "",     "message to show while loading"},
00269         {0, "start",    "OL",   "NUMBER",       "#I",   'D',    "L",    "starting sequence number"},
00270         {0, "style",    "@BASE, BASEFONT, HEAD, HTML, META, PARAM, SCRIPT, STYLE, TITLE",       "%StyleSheet;", "#I",   'n',    "",     "associated style info"},
00271         {0, "summary",  "TABLE",        "%Text;",       "#I",   'n',    "",     "purpose/structure for speech output"},
00272         {0, "tabindex", "A, AREA, BUTTON, INPUT, OBJECT, SELECT, TEXTAREA",     "NUMBER",       "#I",   'n',    "",     "position in tabbing order"},
00273         {0, "target",   "A, AREA, BASE, FORM, LINK",    "%FrameTarget;",        "#I",   'f',    "L",    "render in this frame"},
00274         {0, "text",     "BODY", "%Color;",      "#I",   'D',    "L",    "document text color"},
00275         {0, "title",    "@BASE, BASEFONT, HEAD, HTML, META, PARAM, SCRIPT, TITLE",      "%Text;",       "#I",   'n',    "",     "advisory title"},
00276         {0, "type",     "A, LINK",      "%ContentType;",        "#I",   'n',    "",     "advisory content type"},
00277         {0, "type",     "OBJECT",       "%ContentType;",        "#I",   'n',    "",     "content type for data"},
00278         {0, "type",     "PARAM",        "%ContentType;",        "#I",   'n',    "",     "content type for value when valuetype=ref"},
00279         {0, "type",     "SCRIPT",       "%ContentType;",        "#R",   'n',    "",     "content type of script language"},
00280         {0, "type",     "STYLE",        "%ContentType;",        "#R",   'n',    "",     "content type of style language"},
00281         {0, "type",     "INPUT",        "%InputType;",  "TEXT", 'n',    "",     "what kind of widget is needed"},
00282         {0, "type",     "LI",   "%LIStyle;",    "#I",   'D',    "L",    "list item style"},
00283         {0, "type",     "OL",   "%OLStyle;",    "#I",   'D',    "L",    "numbering style"},
00284         {0, "type",     "UL",   "%ULStyle;",    "#I",   'D',    "L",    "bullet style"},
00285         {0, "type",     "BUTTON",       "(button|submit|reset)",        "submit",       'n',    "",     "for use as form button"},
00286         {0, "usemap",   "IMG, INPUT, OBJECT",   "%URI;",        "#I",   'n',    "",     "use client-side image map"},
00287         {0, "valign",   "COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR",       "(top|middle|bottom|baseline)", "#I",   'n',    "",     "vertical alignment in cells"},
00288         {0, "value",    "INPUT",        "CDATA",        "#I",   'n',    "",     "Specify for radio buttons and checkboxes"},
00289         {0, "value",    "OPTION",       "CDATA",        "#I",   'n',    "",     "defaults to element content"},
00290         {0, "value",    "PARAM",        "CDATA",        "#I",   'n',    "",     "property value"},
00291         {0, "value",    "BUTTON",       "CDATA",        "#I",   'n',    "",     "sent to server when submitted"},
00292         {0, "value",    "LI",   "NUMBER",       "#I",   'D',    "L",    "reset sequence number"},
00293         {0, "valuetype",        "PARAM",        "(DATA|REF|OBJECT)",    "DATA", 'n',    "",     "How to interpret value"},
00294         {0, "version",  "HTML", "CDATA",        "%HTML.Version;",       'D',    "L",    "Constant"},
00295         {0, "vlink",    "BODY", "%Color;",      "#I",   'D',    "L",    "color of visited links"},
00296         {0, "vspace",   "APPLET, IMG, OBJECT",  "%Pixels;",     "#I",   'D',    "L",    "vertical gutter"},
00297         {0, "width",    "HR",   "%Length;",     "#I",   'D',    "L",    ""},
00298         {0, "width",    "IFRAME",       "%Length;",     "#I",   'f',    "L",    "frame width"},
00299         {0, "width",    "IMG, OBJECT",  "%Length;",     "#I",   'n',    "",     "override width"},
00300         {0, "width",    "TABLE",        "%Length;",     "#I",   'n',    "",     "table width"},
00301         {0, "width",    "TD, TH",       "%Length;",     "#I",   'D',    "L",    "width for cell"},
00302         {0, "width",    "APPLET",       "%Length;",     "#R",   'D',    "L",    "initial width"},
00303         {0, "width",    "COL",  "%MultiLength;",        "#I",   'n',    "",     "column width specification"},
00304         {0, "width",    "COLGROUP",     "%MultiLength;",        "#I",   'n',    "",     "default width for enclosed COLs"},
00305         {0, "width",    "PRE",  "NUMBER",       "#I",   'D',    "L",    "predefined characters width"},
00306         {0, NULL, NULL, NULL, NULL, '\0', NULL, NULL}};
// Shared (static) attribute cross-reference object. It starts out unset (nil);
// GetAttrRef()/GetAttrDef() assert that it has been created before use and
// ReleaseHash() deletes it and resets it to nil.
00307 sAttrRefer* gHtmlParser::lAttrRef=nil;
00308 
// Default attribute-normalisation table.
// nDefAttrNorms starts at -1; GetDefaultAttrNorm() and ReleaseHash() use it as
// an inclusive upper index into lDefAttrNorms.
// Each row is { tag id, comma-separated attribute names, pointer slot }.
// The pointer slot (pAttrSeq) is NULL here and is deleted by ReleaseHash().
// NOTE(review): the final { -1, NULL, NULL } row looks like an end-of-table
// marker -- confirm against the code that sets nDefAttrNorms.
00309 t_int16 gHtmlParser::nDefAttrNorms=-1;
00310 sAttrNorm gHtmlParser::lDefAttrNorms[]={
00311         { -2, "\0", NULL },  // all tags norm-behaviour
00312         { XH_IDTAG_ANCHOR, "HREF", NULL },
00313         { XH_IDTAG_IMG, "SRC, ALT", NULL },
00314         { -1, NULL, NULL }};
00315 
00316 // The following table was based on HTML-4.01:
00317 //              http://www.w3.org/TR/html401/index/attributes.html
00318 //
// Catalogue of attribute value types; GetAttrType() hands out rows by index
// (index 0, the header row, is never returned by it).
// Row layout: { group code, type name, type enum, related type }.
//  - group code: 1 on the basic names (CDATA, ID, ...), 0 on the names that
//    the attribute table refers to as "%Name;", 10 on the enumerated choice
//    lists, -2 on the header and trailing marker rows.
//    NOTE(review): meaning of the codes is inferred from the rows -- confirm
//    against the sHAttrType declaration.
//  - related type: on the plural rows (IDREFS, Charsets, ContentTypes,
//    MultiLengths) it names the matching singular type; elsewhere it is
//    e_HAT_Single, or -1 on choice/marker rows.
00319 sHAttrType gHtmlParser::lHAttrTypes[]={
00320         { -2, "%HTML-Attribute-types", e_HAT_None, e_HAT_Single }, // typeFamily here indicates nr of types
00321         { 1, "CDATA", e_HAT_CDATA, e_HAT_Single },
00322         { 1, "ID", e_HAT_ID, e_HAT_Single },
00323         { 1, "IDREF", e_HAT_IDREF, e_HAT_Single },
00324         { 1, "IDREFS", e_HAT_IDREFS, e_HAT_IDREF },
00325         { 1, "NAME", e_HAT_NAME, e_HAT_Single },
00326         { 1, "NUMBER", e_HAT_NUMBER, e_HAT_Single },
00327         { 0, "CAlign", e_HATp_CAlign, e_HAT_Single },  // "%CAlign;" referred; below, "%Character;", etc.
00328         { 0, "Character", e_HATp_Character, e_HAT_Single },
00329         { 0, "Charset", e_HATp_Charset, e_HAT_Single },
00330         { 0, "Charsets", e_HATp_Charsets, e_HATp_Charset },
00331         { 0, "Color", e_HATp_Color, e_HAT_Single },
00332         { 0, "ContentType", e_HATp_ContentType, e_HAT_Single },
00333         { 0, "ContentTypes", e_HATp_ContentTypes, e_HATp_ContentType },
00334         { 0, "Coords", e_HATp_Coords, e_HAT_Single },
00335         { 0, "Datetime", e_HATp_DateTime, e_HAT_Single },
00336         { 0, "FrameTarget", e_HATp_FrameTarget, e_HAT_Single },
00337         { 0, "HTML.Version", e_HATp_HtmlVersion, e_HAT_Single },
00338         { 0, "IAlign", e_HATp_IAlign, e_HAT_Single },
00339         { 0, "InputType", e_HATp_InputType, e_HAT_Single },
00340         { 0, "LAlign", e_HATp_LAlign, e_HAT_Single },
00341         { 0, "LanguageCode", e_HATp_LanguageCode, e_HAT_Single },
00342         { 0, "Length", e_HATp_Length, e_HAT_Single },
00343         { 0, "LinkTypes", e_HATp_LinkTypes, e_HAT_Single },
00344         { 0, "LIStyle", e_HATp_LIStyle, e_HAT_Single },
00345         { 0, "MediaDesc", e_HATp_MediaDesc, e_HAT_Single },
00346         { 0, "MultiLength", e_HATp_MultiLength, e_HAT_Single },
00347         { 0, "MultiLengths", e_HATp_MultiLengths, e_HATp_MultiLength },
00348         { 0, "OLStyle", e_HATp_OLStyle, e_HAT_Single },
00349         { 0, "Pixels", e_HATp_Pixels, e_HAT_Single },
00350         { 0, "Scope", e_HATp_Scope, e_HAT_Single },
00351         { 0, "Script", e_HATp_Script, e_HAT_Single },
00352         { 0, "Shape", e_HATp_Shape, e_HAT_Single },
00353         { 0, "StyleSheet", e_HATp_StyleSheet, e_HAT_Single },
00354         { 0, "TAlign", e_HATp_TAlign, e_HAT_Single },
00355         { 0, "Text", e_HATp_Text, e_HAT_Single },
00356         { 0, "TFrame", e_HATp_TFrame, e_HAT_Single },
00357         { 0, "TRules", e_HATp_TRules, e_HAT_Single },
00358         { 0, "ULStyle", e_HATp_ULStyle, e_HAT_Single },
00359         { 0, "URI", e_HATp_URI, e_HAT_Single },
00360         { 10, "1|0", e_HATp_EnumChoice, -1 },   // Here start the choices
00361         { 10, "button|submit|reset", e_HATp_EnumChoice, -1 },
00362         { 10, "checked", e_HATp_EnumChoice, -1 },
00363         { 10, "compact", e_HATp_EnumChoice, -1 },
00364         { 10, "DATA|REF|OBJECT", e_HATp_EnumChoice, -1 },
00365         { 10, "declare", e_HATp_EnumChoice, -1 },
00366         { 10, "defer", e_HATp_EnumChoice, -1 },
00367         { 10, "disabled", e_HATp_EnumChoice, -1 },
00368         { 10, "GET|POST", e_HATp_EnumChoice, -1 },
00369         { 10, "ismap", e_HATp_EnumChoice, -1 },
00370         { 10, "left|all|right|none", e_HATp_EnumChoice, -1 },
00371         { 10, "left|center|right|justify|char", e_HATp_EnumChoice, -1 },
00372         { 10, "left|center|right|justify", e_HATp_EnumChoice, -1 },
00373         { 10, "left|center|right", e_HATp_EnumChoice, -1 },
00374         { 10, "ltr|rtl", e_HATp_EnumChoice, -1 },
00375         { 10, "multiple", e_HATp_EnumChoice, -1 },
00376         { 10, "nohref", e_HATp_EnumChoice, -1 },
00377         { 10, "noresize", e_HATp_EnumChoice, -1 },
00378         { 10, "noshade", e_HATp_EnumChoice, -1 },
00379         { 10, "nowrap", e_HATp_EnumChoice, -1 },
00380         { 10, "readonly", e_HATp_EnumChoice, -1 },
00381         { 10, "selected", e_HATp_EnumChoice, -1 },
00382         { 10, "top|middle|bottom|baseline", e_HATp_EnumChoice, -1 },
00383         { 10, "yes|no|auto", e_HATp_EnumChoice, -1 },
// The two rows below close the table with version strings.
// NOTE(review): how "4.01" / "v0.0" are consumed is not visible here.
00384         { -2, "4.01", e_HAT_None, -1 },
00385         { -2, "v0.0", e_HAT_None, -1 }};
00386 
// Shared (static) hash used to look tags up by name: thisFindTag() queries it
// with hElems->Find(), ReleaseHash() deletes it. It starts out unset (nil).
// NOTE(review): the "SIZE( 7591 )" remark presumably records the intended
// number of hash rows -- the allocation itself is not in this part of the file.
00387 // 7591 is prime, see http://www.math.utah.edu/~alfeld/math/primelist.html
00388 gHashTriple* gHtmlParser::hElems=nil;// SIZE( 7591 )
00389 ////////////////////////////////////////////////////////////
00390 sAttrRefer::sAttrRefer (t_int16 nAttrs)
00391     : nlAttrs( nAttrs ),
00392       sAttrs( nil ),
00393       lstRelated( nil ),
00394       idxUniqMax( -1 ),
00395       sLUniqs( nil ),
00396       idxLUniqs( nil )
00397 {
00398  sAttrs = new gString[ nlAttrs ];
00399  ASSERTION(sAttrs!=nil,"sAttrs!=nil");
00400  lstRelated = new gSmartList[ nlAttrs ];
00401  ASSERTION(lstRelated!=nil,"lstRelated!=nil");
00402  lstIsAllButEtc = new gSwitch[ nlAttrs ];
00403  ASSERTION(lstIsAllButEtc!=nil,"lstIsAllButEtc!=nil");
00404  sLUniqs = new gString[ nlAttrs ];
00405  ASSERTION(sLUniqs!=nil,"sLUniqs!=nil");
00406  idxLUniqs = new t_int16[ nlAttrs ];
00407  ASSERTION(idxLUniqs!=nil,"idxLUniqs!=nil");
00408  for (t_int16 idx=0; idx<nlAttrs; idx++) {
00409      idxLUniqs[ idx ] = -1;
00410  }
00411 }
00412 
00413 sAttrRefer::~sAttrRefer ()
00414 {
00415  delete[] sAttrs;
00416  delete[] lstRelated;
00417  delete[] lstIsAllButEtc;
00418  delete[] sLUniqs;
00419  delete[] idxLUniqs;
00420 }
00421 
00422 t_int16 sAttrRefer::FindAttr (char* attrName, t_int16& uniqIdx)
00423 {
00424  // Returns the first absolute index (for lAttrRef), or -1 if not found.
00425  for (uniqIdx=0; uniqIdx<=idxUniqMax; uniqIdx++) {
00426      if ( sLUniqs[ uniqIdx ].Match( attrName ) ) {
00427          return idxLUniqs[ uniqIdx ];
00428      }
00429  }
00430  uniqIdx = -1;
00431  return -1;
00432 }
00433 
00434 t_int16 sAttrRefer::FindAttr (char* attrName, char* strTag, t_int16& uniqIdx)
00435 {
00436  bool isAllBut;
00437  t_int16 idxAbs, firstIdx = FindAttr( attrName, uniqIdx );
00438 
00439  sOutHelper.SetEmpty();
00440 
00441  if ( firstIdx<0 ) return -1;
00442  ASSERTION(uniqIdx>=0,"uniqIdx>=0");
00443 
00444  // Within attributes firstIdx to the last attribute with the same name,
00445  // compare 'strTag' with respective attribute 'related tags'
00446  for (idxAbs=firstIdx; idxAbs<nlAttrs; idxAbs++) {
00447      isAllBut = lstIsAllButEtc[ idxAbs ].IsOn();
00448      if ( isAllBut==false ) {
00449          // Usual case...
00450          if ( lstRelated[ idxAbs ].Match( strTag )>0 )
00451              return idxAbs;  // Note uniqIdx surely matches!
00452      }
00453      else {
00454          if ( lstRelated[ idxAbs ].Match( strTag )==0 )
00455              return idxAbs;
00456          sOutHelper.Set( lstRelated[ idxAbs ].Str(1) );
00457          // TODO: replicate for all other finds
00458      }
00459 
00460      // We can afford to make the following evaluation here
00461      // (since the first match occurred)
00462      if ( sAttrs[ idxAbs ].Match( attrName )==false ) break;
00463  }
00464  // No attribute for the tag: 'strTag'
00465  return -1;
00466 }
00467 ////////////////////////////////////////////////////////////
00468 gHtmlCouple::gHtmlCouple (unsigned lineNr, char* sText)
00469     : iLine( lineNr ),
00470       idTag( XH_NOTAG ),
00471       idEndTag( -1 ),
00472       pHStr( nil ),
00473       pElem( nil ),
00474       synError( 0 ),
00475       coupleId( -1 ),
00476       theDocType( 0 ),
00477       oCouple( nil )
00478 {
00479  if ( sText!=nil ) AddText( sText );
00480 }
00481 
00482 gHtmlCouple::gHtmlCouple (unsigned lineNr, char* strTag, char* sAttrLst, bool doAddSkippedTags)
00483     : iLine( lineNr ),
00484       idTag( -1 ),
00485       idEndTag( -1 ),
00486       pHStr( nil ),
00487       pElem( nil ),
00488       synError( 0 ),
00489       coupleId( -1 ),
00490       theDocType( 0 ),
00491       oCouple( nil )
00492 {
00493  DBGPRINT_MIN("DBG: Couple:%s, sAttrLst:%s\n",strTag,sAttrLst);
00494  AddTag( strTag );
00495  if ( synError==0 || (synError==XH_SKIP_TAG && doAddSkippedTags==true) ) {
00496      if ( sAttrLst!=nil ) {
00497          Add( sAttrLst );
00498          // Update attrL with the list of attributes found in string 'sAttrLst'
00499          attrL.Set( sAttrLst );
00500      }
00501  }
00502 }
00503 
00504 gHtmlCouple::~gHtmlCouple ()
00505 {
00506  delete pHStr;
00507  delete oCouple;
00508 }
00509 
00510 bool gHtmlCouple::IsOk ()
00511 {
00512  ASSERTION(IsText() || (IsText()==false && ((idTag>=0 && pElem!=nil) || (idTag==-1 && synError==-1 && pElem==nil))),"gHtmlCouple::IsOk");
00513  return synError==0;
00514 }
00515 
00516 char* gHtmlCouple::GetStr ()
00517 {
00518  char* str;
00519  sWholeTag.SetEmpty();
00520  if ( IsText() ) {
00521      str = Str( 1 );
00522      if ( pHStr==nil ) return str;
00523      char* strX = pHStr->Str();
00524      if ( strX==nil ) return nil;  // Never happens
00525      if ( strX[0]==0 ) return str;  // May happen if there is an invalid symbol within the string (e.g. amp: &)
00526      return strX;
00527  }
00528  sWholeTag.Add( '<' );
00529  if ( IsTagEnd() ) sWholeTag.Add( '/' );
00530  sWholeTag.AddString( sTag );
00531  if ( N()==2 ) {
00532      sWholeTag.Add( ' ' );
00533      // attrL.Str provides the full concatenated string of the attribute assignments
00534      sWholeTag.Add( attrL.Str() );
00535  }
00536  sWholeTag.Add( '>' );
00537  return sWholeTag.Str();
00538 }
00539 
00540 char* gHtmlCouple::GetStrForTree ()
00541 {
00542  char* str;
00543 
00544  sKeepStr = "TODO:::TODO:::";
00545  sWholeTag.SetEmpty();
00546  if ( IsText() ) {
00547      str = Str( 1 );
00548      if ( pHStr==nil ) return str;
00549      char* strX = pHStr->Str();
00550      if ( strX==nil ) return nil;  // Never happens
00551      if ( strX[0]==0 ) return str;  // May happen if there is an invalid symbol within the string (e.g. amp: &)
00552      return strX;
00553  }
00554  sWholeTag.Add( '<' );
00555  if ( IsTagEnd() ) sWholeTag.Add( '/' );
00556  sWholeTag.AddString( sTag );
00557  if ( N()==2 ) {
00558      sWholeTag.Add( ' ' );
00559      // attrL.Str provides the full concatenated string of the attribute assignments
00560      sWholeTag.Add( attrL.Str() );
00561  }
00562  sWholeTag.Add( '>' );
00563  sprintf(sKeepStr.Str(),"[%u]",N());
00564  sWholeTag.AddString( sKeepStr );
00565  return sWholeTag.Str();
00566 }
00567 
00568 gString& gHtmlCouple::TagString (bool forceEnd)
00569 {
00570  sTagStr.SetEmpty();
00571  if ( IsText() ) return sTagStr;
00572  sTagStr.Add( '<' );
00573  if ( forceEnd || IsTagEnd() ) sTagStr.Add( '/' );
00574  sTagStr.AddString( sTag );
00575  sTagStr.Add( '>' );
00576  return sTagStr;
00577 }
00578 
00579 char* gHtmlCouple::GetHRef ()
00580 {
00581  if ( IsText()==true ) return nil;
00582  if ( IsAnchor()==false || IsTagEnd()==true ) return nil;
00583  return attrL.Find( "HREF", true );
00584 }
00585 
00586 unsigned gHtmlCouple::Add (char* s)
00587 {
00588  gString sTrim( s );
00589  sTrim.Trim();
00590  return gList::Add( sTrim );
00591 }
00592 
00593 unsigned gHtmlCouple::Add (gString& copy)
00594 {
00595  return Add( copy.Str() );
00596 }
00597 
00598 unsigned gHtmlCouple::AddText (char* s)
00599 {
00600  // Return 1 iff the text is valid
00601  Add( s );
00602  ASSERTION(N()==1,"N()==1");
00603  s = Str( 1 );  // Trimmed text string
00604  pHStr = new gHtmlString( s );
00605  ASSERTION(pHStr!=nil,"pHStr!=nil");
00606  return (unsigned)pHStr->IsOk();
00607 }
00608 
00609 unsigned gHtmlCouple::AddTag (char* strTag)
00610 {
00611  t_uchar uChr;
00612  // Return 0 if tag is empty, or basically invalid; 1 otherwise
00613  ASSERTION(strTag!=nil,"strTag!=nil");
00614  gString s( strTag );
00615  s.Trim();
00616  if ( s.IsEmpty() ) return 0;
00617  gString sUp( s );
00618  uChr = s[1];
00619  if ( uChr=='!' || uChr=='?' ) {
00620      synError = XH_SKIP_TAG;
00621      theDocType = (t_int16)sUp.Match("!DOCTYPE");
00622  }
00623  // synError<-1 not yet implemented (at least here not used)
00624  sUp.UpString();
00625  gList::Add( sUp ); // Add(char*) is not called, it is not virtual, ok
00626  sTag = sUp;
00627  return 1;
00628 }
00629 ////////////////////////////////////////////////////////////
00630 gHtmlContent::gHtmlContent ()
00631     : nLines( 0 ),
00632       theHtmlOpt( nil )
00633 {
00634 }
00635 
00636 gHtmlContent::~gHtmlContent ()
00637 {
00638 }
00639 
00640 gHtmlCouple* gHtmlContent::GetCouple (unsigned idx)
00641 {
00642  ASSERTION(idx>0,"idx>0");
00643  if ( IsValidIndex(idx)==false ) return nil;
00644  gStorage* pObjx = GetObjectPtr( idx );
00645  ASSERTION(pObjx!=nil,"pObjx!=nil");
00646  return (gHtmlCouple*)pObjx;
00647 }
00648 
00649 char* gHtmlContent::Str (unsigned idx)
00650 {
00651  gHtmlCouple* pObjC = GetCouple( idx );
00652  if ( pObjC==nil ) return nil;
00653  // => simplified method: now GetStr made by HtmlCouple!
00654  return pObjC->GetStr();
00655 }
00656 
00657 unsigned gHtmlContent::Add (char* s)
00658 {
00659  nLines++;
00660  if ( s==nil || s[0]==0 ) return 0;
00661  thisAddHmtlLine( nLines, s );
00662  return nLines;
00663 }
00664 
00665 int gHtmlContent::UpCaseAttributes (char* strTag, gString& sRes)
00666 {
00667  // Return 0 if ok; note it will eventually manipulates sRes (if tidyed)
00668  unsigned i, n;
00669  unsigned idx;
00670  t_uchar uChr;
00671  short quoteCount=0;
00672  bool tidyApplied=false;
00673 
00674  ASSERTION(strTag!=nil,"strTag!=nil");
00675 
00676  // Check if tidy was requested
00677  if ( GetHtmlOpt().outOpt.HasTidy( strTag, idx ) ) {
00678      // Apply tidy
00679      gString s;
00680      // Tidy blanks first
00681      for (i=1, n=sRes.Length(); i<=n; i++) {
00682          uChr = sRes[i];
00683          if ( uChr=='"' ) quoteCount = quoteCount==0;
00684          if ( uChr=='=' && quoteCount==0 ) {
00685              s.TrimRight();
00686          }
00687          if ( uChr==' ' ) {
00688              if ( sRes[i-1]=='=' ) continue;
00689          }
00690          s.Add( uChr );
00691      }
00692      sRes = s;
00693      DBGPRINT("DBG: Apply tidy for [%s|%s] (idx=%u)\n",strTag,sRes.Str(),idx);
00694      s.SetEmpty();
00695      gParam aParam( sRes, " ", gParam::e_NormalQuoted );
00696      // Now split each param, and add them
00697      for (i=1, n=aParam.N(); i<=n; i++) {
00698          gString sAssign;
00699          gString sTemp( aParam.Str(i) );
00700          sTemp.Trim();
00701          gParam paramVal( sTemp, "=", gParam::e_StopSplitOnFirst );
00702          unsigned nVal = paramVal.N();
00703          ASSERTION(nVal==1 || nVal==2,"nVal...");
00704          sAssign.Set( paramVal.Str(1) );
00705          if ( nVal>=2 ) {
00706              sAssign.UpString();  // Only the assign-lval (e.g. SRC of a <IMG SRC=...>)
00707              // Check if attribute-value is to be tidyed
00708              bool doTidyAttr = GetHtmlOpt().outOpt.HasTidyAttr( strTag, sAssign );
00709              gString sVal( paramVal.Str(2) );
00710              sVal.Trim();  // academic
00711              if ( doTidyAttr ) {
00712                  if ( sVal[1]!='"' && sVal[1]!='\'' ) {
00713                      gString sTempVal( "\"" );
00714                      sTempVal.AddString( sVal );
00715                      sTempVal.Add( "\"" );
00716                      sVal = sTempVal;
00717                  }
00718                  else {
00719                      switch ( sVal[1] ) {
00720                      case '"':
00721                          if ( sVal[sVal.Length()]=='\'' ) {
00722                              sVal[sVal.Length()] = '"';
00723                          }
00724                          break;
00725                      case '\'':
00726                          // Tidy 'abc' to "ABC" if double-quote not found
00727                          if ( sVal.Find( '"' )==0 && sVal[sVal.Length()]=='\'' ) {
00728                              sVal[1] = '"';
00729                              sVal[sVal.Length()] = '"';
00730                          }
00731                      default:
00732                          break; //nothing
00733                      }
00734                  }
00735              }//end IF doTidy...
00736              sAssign.Add( "=" );
00737              sAssign.AddString( sVal );
00738          }
00739          DBGPRINT("DBG: param: [%s]\n",sAssign.Str());
00740          if ( i>1 ) s.Add( " " );
00741          s.AddString( sAssign );
00742      }
00743      tidyApplied = sRes.Match( s )==false;
00744 
00745      DBGPRINT_WEB3("DBG: Applied tidy(%c): [%s|%s]\n",ISyORn(tidyApplied),sRes.Str(),s.Str());
00746 
00747      // Replace the result
00748      sRes = s;
00749  }
00750 
00751  for (i=1, n=sRes.Length(), quoteCount=0; i<=n; i++) {
00752      uChr = sRes[i];
00753      if ( uChr=='"' ) quoteCount = quoteCount==0;
00754      if ( quoteCount ) continue;
00755      if ( uChr>='a' && uChr<='z' ) sRes[i] = uChr-32;
00756  }
00757 
00758  return quoteCount!=0;
00759 }
00760 
00761 gHtmlOpt& gHtmlContent::GetHtmlOpt ()
00762 {
00763  ASSERTION(theHtmlOpt!=nil,"theHtmlOpt!=nil");
00764  return *theHtmlOpt;
00765 }
00766 
00767 bool gHtmlContent::SetHtmlOpt (gHtmlOpt* pHtmlOpt)
00768 {
00769  return (theHtmlOpt = pHtmlOpt)!=nil;
00770 }
00771 
00772 int gHtmlContent::TagError (unsigned lineNr, int error, char* sLine, char* sShortMsg)
00773 {
00774  FILE* fRepErr = stderr;
00775 
00776  if ( error==0 ) return 0;
00777  HTML_LOG(fRepErr,LOG_ERROR,"Line %u: Invalid syntax (int-ref %d): '%s'. %s\n",
00778           lineNr,
00779           error,
00780           sLine,
00781           sShortMsg);
00782  return error;
00783 }
00784 
00785 void gHtmlContent::Show (bool doShowAll)
00786 {
00787  unsigned i, n=N();
00788 
00789  for (i=1; i<=n; i++) {
00790      if ( doShowAll ) printf("C%u/%u: ",i,n);
00791      printf("%s\n",Str(i));
00792  }
00793 }
00794 
00795 int gHtmlContent::thisAddHmtlLine (unsigned lineNr, char* s)
00796 {
00797  gString sLine( s ), sRes;
00798  unsigned pos, posEnd, posTagEnd, len=sLine.Length();
00799 
00800  ASSERTION(len>0,"len>0");
00801  sLine.Trim();  // Not absolutely necessary
00802  pos = sLine.Find('<');
00803  posEnd = sLine.Find('>');
00804  if ( pos>0 ) {
00805      if ( posEnd>0 && posEnd+1>pos ) {
00806          sRes.CopyFromTo( sLine, pos+1, posEnd-1 );
00807          sRes.Trim();
00808          posTagEnd = sRes[1]=='/';
00809          return thisAddHtmlTag( lineNr, sRes.Str()+posTagEnd, posTagEnd>0 );
00810      }
00811  }
00812  return thisAddHtmlText( lineNr, sLine.Str() );
00813 }
00814 
00815 int gHtmlContent::thisAddHtmlText (unsigned lineNr, char* sText)
00816 {
00817  // Returns always 0
00818  gHtmlCouple* pCouple;
00819 
00820  pCouple = new gHtmlCouple( lineNr, sText );
00821  thisAddCouple( pCouple, *this );
00822  return 0;
00823 }
00824 
00825 int gHtmlContent::thisAddHtmlTag (unsigned lineNr, char* strTag, bool isEndTag)
00826 {
00827  // Return 0 if all ok
00828  int error=0;
00829  unsigned pos;
00830  gHtmlCouple* pCouple;
00831 
00832  ASSERTION(strTag!=nil,"strTag!=nil");
00833 
00834  // sTag can be just, e.g. "BODY"
00835  // or eventually "BODY LINK=xyz"
00836  pos = gStrControl::Self().Find( strTag, " " );
00837  if ( pos==0 ) {
00838      pCouple = new gHtmlCouple( lineNr, strTag, nil );
00839  }
00840  else {
00841      gString sMainTag( strTag );
00842      sMainTag[ pos ] = 0;
00843      DBGPRINT("DBG: thisAddHtmlTag: %s [%s|%s]\n",strTag,sMainTag.Str(),strTag);
00844      strTag += pos;
00845      gString sAttr( strTag );
00846      sAttr.Trim();
00847      UpCaseAttributes( sMainTag.Str(), sAttr );
00848      pCouple = new gHtmlCouple( lineNr, sMainTag.Str(), sAttr.Str() );
00849      error = pCouple->attrL.lastOpError;
00850      TagError( lineNr, error, strTag, "Invalid attribute" );
00851  }
00852  pCouple->idEndTag = isEndTag ? XH_ENDTAG : -1;  // Instead of being limitted to pos==0.
00853  thisAddCouple( pCouple, *this );
00854  return error;
00855 }
00856 
00857 int gHtmlContent::thisAddCouple (gHtmlCouple* pCouple, gList& oL)
00858 {
00859  ASSERTION(pCouple!=nil,"pCouple!=nil");
00860  oL.AppendObject( pCouple );
00861  return 0;
00862 }
00863 ////////////////////////////////////////////////////////////
00864 gHtmlCouple* gHParsed::CurrentCouple ()
00865 {
00866  gHtmlCouple* pCouple;
00867  ASSERTION(Depth()>0,"Depth()>0");
00868  pCouple = (gHtmlCouple*)GetCurrent();
00869  ASSERTION(pCouple!=nil,"pCouple!=nil");
00870  return pCouple;
00871 }
00872 
00873 char* gHParsed::Str (unsigned idx)
00874 {
00875  return GetCouple( idx )->GetStr();
00876 }
00877 
00878 char* gHParsed::StrMust (unsigned idx)
00879 {
00880  gHtmlCouple* pCouple;
00881  pCouple = (gHtmlCouple*)kMust.GetObjectPtr( idx );
00882  ASSERTION(pCouple!=nil,"pCouple!=nil");
00883  return pCouple->GetStr();
00884 }
00885 
00886 gHtmlCouple* gHParsed::GetCouple (unsigned idx)
00887 {
00888  ASSERTION(IsValidIndex(idx),"IsValidIndex(idx)");
00889  gStorage* pObx = GetObjectPtr( idx );
00890  ASSERTION(pObx!=nil,"pObx!=nil");
00891  return (gHtmlCouple*)pObx;
00892 }
00893 
00894 gHtmlCouple* gHParsed::FindCouple (t_int16 idTag)
00895 {
00896  unsigned idx, n=N();
00897  gHtmlCouple* pCouple;
00898 
00899  if ( idTag<0 ) return nil;
00900  for (idx=1; idx<=n; idx++) {
00901      pCouple = GetCouple( idx );
00902      if ( pCouple->idTag==idTag ) return pCouple;
00903  }
00904  return nil;
00905 }
00906 
00907 void gHParsed::PushTagOptEnd (gHtmlCouple& couple)
00908 {
00909  thisPushCouple( couple, true );
00910 }
00911 
00912 int gHParsed::PushTag (gHtmlCouple& couple, bool doCheckOnly)
00913 {
00914  t_int16 id = couple.idTag;
00915  int error=0;
00916 
00917  if ( doCheckOnly==false ) thisPushCouple( couple, false );
00918 
00919  ///printf("DBG: ###PUSH: "); for (unsigned dbgIdx=1; dbgIdx<=kMust.N(); dbgIdx++) printf("[%s]",StrMust(dbgIdx)); printf("DBG: ###(%s).\n",couple.sTag.Str());
00920 
00921  switch ( id ) {
00922  case XH_IDTAG_BASE:
00923      // Cowardly not assuming an error,
00924      // because here you base use, e.g.: '<BASE TARGET="_blank">'
00925      // and also '<BASE HREF...>'; therefore we choose not to check.
00926      // COMMENTED, see above: if ( hasBaseHRef ) return 101;  // More than one BASE_HREF
00927      hasBaseHRef = true;
00928      if ( state!=e_HS_Head ) return 102;  // Badly placed BASE_HREF
00929      break;
00930  case XH_IDTAG_BODY:
00931      error = state==e_HS_HeadAfter ? 0 : -103; // BODY not after HEAD
00932      state = e_HS_Body;
00933      break;
00934  case XH_IDTAG_HEAD:
00935      error = state==e_HS_Html || state==e_HS_StartAfter ? 0 : -104; // HEAD not after HTML
00936      state = e_HS_Head;
00937      break;
00938  case XH_IDTAG_HTML:
00939      error = state==e_HS_Start ? 0 : -105; // <HTML> not at begining
00940      state = e_HS_Html;
00941      break;
00942  default:
00943      if ( state==e_HS_Start ) {
00944          state = e_HS_StartAfter;
00945          return -106;  // <HTML> optionally expected
00946      }
00947      if ( state==e_HS_End ) return 107;  // </HTML> already parsed
00948      if ( state==e_HS_HeadAfter ) return -108;  // Invalid tag after </HEAD>
00949      if ( state==e_HS_BodyAfter ) return -109;  // Invalid tag after </BODY>
00950      return 0;
00951  }
00952 
00953  return error;
00954 }
00955 
00956 int gHParsed::PopTag (bool hasOptEnd)
00957 {
00958  t_int16 id;
00959  int error=0;
00960  gHtmlCouple* pCouple = CurrentCouple();
00961 
00962  ///printf("DBG: ###POP_: "); for (unsigned dbgIdx=1; dbgIdx<=kMust.N(); dbgIdx++) printf("[%s]",StrMust(dbgIdx)); printf("DBG: ### line %u:(%s).\n",((gHtmlCouple*)kMust.EndPtr()->me)->iLine,((gHtmlCouple*)kMust.EndPtr()->me)->sTag.Str());
00963 
00964  id = pCouple->idTag;
00965 
00966  Pop();
00967  if ( hasOptEnd ) return 0;
00968 
00969  kMust.Pop();
00970 
00971  switch ( id  ) {
00972  case XH_IDTAG_BODY:
00973      error = state==e_HS_Body ? 0 : 121;
00974      state = e_HS_BodyAfter;
00975      break;
00976  case XH_IDTAG_HEAD:
00977      error = state==e_HS_Head ? 0 : 122;
00978      state = e_HS_HeadAfter;
00979      break;
00980  case XH_IDTAG_HTML:
00981      error = state==e_HS_BodyAfter ? 0 : 123;
00982      state = e_HS_End;
00983      break;
00984  default:
00985      break;
00986  }
00987 
00988  return 0;
00989 }
00990 
00991 void gHParsed::TrashLast ()
00992 {
00993  DBGPRINT("DBG: trash: %s (N=%u kMustN=%u)\n",CurrentCouple()->sTag.Str(),N(),kMust.N());
00994  Pop();
00995 }
00996 
00997 int gHParsed::thisPush (gHtmlCouple& couple, gStack& aStack)
00998 {
00999  gHtmlCouple* newCouple;
01000  newCouple = new gHtmlCouple( couple.iLine, couple.sTag.Str(), nil );
01001  ASSERTION(newCouple!=nil,"newCouple!=nil");
01002  newCouple->CopyTag( couple );
01003  aStack.Push( newCouple );
01004  return 0;
01005 }
01006 
01007 int gHParsed::thisPushCouple (gHtmlCouple& couple, bool hasOptEnd)
01008 {
01009  thisPush( couple, *this );
01010  if ( hasOptEnd==false ) thisPush( couple, kMust );
01011  //printf("DBG: PUSH: %s (hasOptEnd?%d)\n",couple.sTag.Str(),hasOptEnd);
01012  return 0;
01013 }
01014 ////////////////////////////////////////////////////////////
01015 gHtmlCouple* gHList::GetCouple (unsigned idx)
01016 {
01017  if ( idx>0 ) return gHtmlContent::GetCouple( idx );
01018  ASSERTION(coupleFakeBody!=nil,"coupleFakeBody!=nil");
01019  return coupleFakeBody;
01020 }
01021 
01022 void gHList::AppendCouple (gHtmlCouple& couple)
01023 {
01024  gHtmlCouple* newCouple;
01025  newCouple = new gHtmlCouple( couple.iLine, couple.sTag.Str(), nil );
01026  ASSERTION(newCouple!=nil,"newCouple!=nil");
01027  // If there are attributes, copy them also
01028  if ( couple.attrL.IsEmpty()==false ) {
01029      newCouple->Add( couple.attrL.Str() );
01030  }
01031  newCouple->CopyTag( couple );
01032  AppendObject( newCouple );
01033 }
01034 ////////////////////////////////////////////////////////////
01035 gHtmlParser::gHtmlParser (gUnweb* ptrUnweb)
01036     : docType( 0 ),
01037       nErrorsSyntax( 0 ),
01038       nErrorsOther( 0 ),
01039       nWarnings( 0 ),
01040       lastWarnOpCode( 0 ),
01041       pUnweb( ptrUnweb )
01042 {
01043  lastOpError = thisInitTbl( nElems );
01044  ASSERTION(lastOpError==0,"lastOpError==0");
01045 }
01046 
01047 gHtmlParser::~gHtmlParser ()
01048 {
01049 }
01050 
01051 sHtmlElement* gHtmlParser::GetTagElement (t_int16 idxTag)
01052 {
01053  if ( idxTag<0 || idxTag>nElems ) return nil;
01054  return &lElems[ idxTag ];
01055 }
01056 
01057 sAttrDef* gHtmlParser::GetAttrDef (t_int16 idxAttr)
01058 {
01059  ASSERTION(lAttrRef!=nil,"lAttrRef!=nil");
01060  ASSERTION(idxAttr>=0 && idxAttr<lAttrRef->nlAttrs,"GetAttrDef(3)");
01061  return &lAttrs[ idxAttr ];
01062 }
01063 
01064 sAttrRefer* gHtmlParser::GetAttrRef ()
01065 {
01066  ASSERTION(lAttrRef!=nil,"lAttrRef!=nil");
01067  return lAttrRef;
01068 }
01069 
01070 sAttrNorm* gHtmlParser::GetDefaultAttrNorm (t_int16 idxNorm)
01071 {
01072  if ( idxNorm<0 || idxNorm>nDefAttrNorms ) return nil;
01073  return &lDefAttrNorms[ idxNorm ];
01074 }
01075 
01076 sHAttrType* gHtmlParser::GetAttrType (t_int16 idxType)
01077 {
01078  t_int16 n = GetNAttrType();
01079  if ( idxType<1 || idxType>n ) return nil;
01080  return &lHAttrTypes[ idxType ];
01081 }
01082 
01083 void gHtmlParser::ReleaseHash ()
01084 {
01085  t_int16 idx;
01086  // Delete hash for tags
01087  delete hElems; hElems = nil;
01088  nElems = -1;
01089  // Delete "Related Elements" lists
01090  delete lAttrRef; lAttrRef = nil;
01091 
01092  for (idx=0; idx<=nDefAttrNorms; idx++) {
01093      delete lDefAttrNorms[ idx ].pAttrSeq;
01094      lDefAttrNorms[ idx ].pAttrSeq = nil;
01095  }
01096 }
01097 
01098 bool gHtmlParser::SetOptions (gHtmlOpt& copy)
01099 {
01100  htmlOpt.CopyOptions( copy );
01101 
01102  t_uint16 i, n=htmlOpt.lIdTagOptEnd.N();
01103  t_int16 id, nMax = nElems;
01104  for (i=1; i<=n; i++) {
01105      id = htmlOpt.lIdTagOptEnd.GetInt( i );
01106      if ( id<0 || id>=nMax ) return false;
01107      char cOpt = lElems[id].optEnd;
01108      DBGPRINT("DBG: CFG_OPTEND: id=%d: %s (%c)\n",id,lElems[id].elemName,cOpt);
01109      if ( cOpt!='@' && cOpt!=XH_TBL_TAG_OPTEND ) return false;
01110      lElems[id].optEnd = XH_TBL_TAG_OPTEND_CFG;
01111  }
01112 
01113  htmlInput.SetHtmlOpt( &htmlOpt );
01114 
01115  return true;
01116 }
01117 
01118 int gHtmlParser::Parse (FILE* fRepErr)
01119 {
01120  int error = thisParse( fRepErr );
01121 
01122  return error;
01123 }
01124 
01125 int gHtmlParser::SetError (int opError)
01126 {
01127   lastOpError = opError;
01128   gControl::SetError( 0 );  // sStrError init'zed
01129   if ( opError==0 ) return 0;
01130   return lastOpError = opError;
01131 }
01132 
01133 int gHtmlParser::SetWarn (int opError)
01134 {
01135  lastWarnOpCode = opError;
01136  if ( opError==0 ) return 0;
01137 
01138  nWarnings++;
01139  return opError;
01140 }
01141 
01142 int gHtmlParser::ShowTree (FILE* fRepErr)
01143 {
01144  // Return 0 on success; -1 if there is nothing to show
01145  short p, nFilled;
01146  unsigned n;
01147  gHSeq hSeq( fRepErr );
01148  eHState aState, showState=e_HS_Body;
01149 
01150  hSeq.SetBaseHRef( myBaseHRef );
01151 
01152 #ifdef DEBUG
01153  for (p=0; p<(short)e_HS_Last; p++) {
01154      n = lParts[p].N();
01155      if ( n==0 ) continue;
01156      printf("ShowPart(%d)--->START<---\n",p);
01157      hSeq.ShowPart( lParts[p], (eHState)p, htmlOpt );
01158      printf("ShowPart(%d)--->END<---\n",p);
01159  }
01160 #endif //DEBUG(_...)
01161 
01162  for (p=0, nFilled=0, aState=e_HS_Body; p<(short)e_HS_Last; p++) {
01163      n = lParts[p].N();
01164      if ( n==0 ) continue;
01165      if ( nFilled==0 ) {
01166          aState = (eHState)p;
01167      }
01168      nFilled++;
01169      DBGPRINT("DBG: nFilled=%d, aState=%d, showState=%d\n",
01170               (int)nFilled,
01171               aState,
01172               showState);
01173  }
01174 
01175  if ( nFilled<=0 ) return -1;
01176 
01177  if ( nFilled==1 ) {
01178      // Simulating we have a BODY statement at begin of doc
01179      showState = e_HS_Sp_FakeBody;
01180  }
01181  else {
01182      // Usually this is the normal case, LTD formatted docs contain in this order:
01183      //   HTML, TITLE, HEAD, BODY
01184      // and we want to show the doc body.
01185      aState = showState;
01186  }
01187 
01188  hSeq.Build( lParts[aState], showState );
01189  hSeq.hTree.Show( true );
01190 
01191  return 0;
01192 }
01193 
01194 
01195 #ifdef DEBUG
01196 int gHtmlParser::Show_dbg (bool doShowAll)
01197 {
01198  int ex;
01199  t_int16 id;
01200  for (ex=1; ex<=(int)htmlInput.N(); ex++) {
01201      gStorage* pObx = htmlInput.GetObjectPtr(ex);
01202      gHtmlCouple* pObC = (gHtmlCouple*)pObx;
01203      thisFindTag( pObC->sTag.Str(), id ) ;
01204      printf("DBG: line %u: %d/%d [%s]",
01205             pObC->iLine,
01206             ex, (short)htmlInput.N(),
01207             htmlInput.Str(ex));
01208      if ( pObC->IsText() )
01209          printf("\n");
01210      else
01211          printf(" TAG:%s {id:%d}\n",pObC->sTag.Str(),id);
01212      /// OLD DEBUG:
01213      ///printf("DBG:[%s], @@@%d (OriginalInputLine=%u, synErr=%d %s)\n",pObC->Str(1),pObC->idTag,pObC->iLine,pObC->synError,pObC->IsTagEnd()?"IsTagEnd":(pObC->IsText()?"TXT":"Tag"));
01214      ///if ( pObC->pHStr!=nil ) printf("DBG:TEXT: [%s]\n\n",pObC->pHStr->Str());
01215  }
01216  return 0;
01217 }
01218 #endif //DEBUG
01219 
01220 int gHtmlParser::thisFillFromUnweb (gUnweb& unweb, gHtmlContent& hInput)
01221 {
01222  unsigned i, nPost = unweb.coordSerial.N();
01223 
01224  for (i=1; i<=nPost; i++) {
01225      hInput.Add( unweb.Str( i ) );
01226  }
01227  return 0;
01228 }
01229 
01230 sHtmlElement* gHtmlParser::thisFindTag (char* strTag, t_int16& idxTag)
01231 {
01232  // Returns the element, or null if not found.
01233  sHtmlElement* pElem;
01234  gHashElemTriple* pHashElem;
01235 
01236  idxTag = -1;
01237  if ( strTag==nil || strTag[0]==0 ) return nil;
01238 
01239  //for (idxTag=0; idxTag<nElems; idxTag++) {
01240  //    pElem = &lElems[idxTag];
01241  //    if ( gStrControl::Self().Match(strTag,pElem->elemName) )
01242  //      return pElem;
01243  //}
01244 
01245  gKey hKey( strTag );
01246  unsigned idx;
01247  int x;
01248  pHashElem = hElems->Find( hKey, idx, x );
01249  if ( pHashElem==nil ) return nil;
01250  // Particularity of this hash-function: for HTML tags, usually one per hash-row!
01251  // x is the row from the hash-table: usually 1.
01252  idxTag = (t_int16)pHashElem->iVal;
01253  ASSERTION(idxTag<nElems,"idxTag<nElems");
01254  pElem = &lElems[idxTag];
01255 
01256  /* --- Just show the entries on hash...
01257  if ( x==1 ) return pElem;
01258  printf("DBG::: thisFindTag: '%s', x=%d (iVal=%d)\n",strTag,x,pHashElem->iVal);
01259  for (idxTag=0; idxTag<=90; idxTag++) {
01260      gKey xKey(lElems[idxTag].elemName);
01261      pHashElem = hElems->Find(xKey,idx,x);
01262      printf("DBG::: idxTag=%d, {idx=%d:x=%d}, NAME=%s\n",idxTag,idx,x,lElems[idxTag].elemName);
01263  }
01264  */
01265 
01266  return pElem;
01267 }
01268 
01269 int gHtmlParser::thisInitTbl (t_int16& size)
01270 {
01271  t_int16 idx, iN;
01272  sHtmlElement* pElem;
01273  sAttrDef* pAttrDef;
01274  char* elemName;
01275  char* attrName, *lastName;
01276  char* str;
01277 
01278  if ( size>0 ) return 0;
01279 
01280  if ( hElems==nil ) {
01281      hElems = new gHashTriple( 7591 );
01282      ASSERTION(hElems!=nil,"hElems!=nil");
01283  }
01284  if ( hElems->IsEmpty()==false ) return 0;
01285 
01286  for (idx=0, size=0;
01287       (pElem = &(lElems[idx]))!=nil &&
01288           (elemName = pElem->elemName)!=nil;
01289       idx++) {
01290      size++;
01291      gKey hKey( elemName );
01292      hElems->AddTriple( hKey, (int)idx );
01293      switch ( pElem->family ) {
01294      case '@':
01295          pElem->family = 0;
01296      case (char)e_HtmlElementPhrase:
01297          break;
01298      default:
01299          return -1;
01300      }
01301      // Check optStart is either 'O' or '@': 1 resp. 0
01302      switch ( pElem->optStart ) {
01303      case 'O':
01304          pElem->optStart = 1;
01305          break;
01306      case '@':
01307          pElem->optStart = 0;
01308          break;
01309      default:
01310          ASSERTION_FALSE("optStart(1)");
01311          break;
01312      }
01313      // isDeprecated becomes either 0 or non-zero
01314      char isDeprecated = pElem->isDeprecated;
01315      if ( isDeprecated=='@' ) isDeprecated = pElem->isDeprecated = 0;
01316      // If is deprecated, then for sure kindDTD is 'L'oose,
01317      // but not the otherway round (e.g. IFRAME is Loose, but not deprecated)
01318      if ( pElem->kindDTD=='@' ) pElem->kindDTD = 0;
01319      if ( isDeprecated!=0 && pElem->kindDTD!='L' ) return -1;
01320  }//end FOR
01321 
01322  for (idx=0; (attrName = lAttrs[idx].attrName)!=NULL; ) idx++;
01323 
01324  ASSERTION(lAttrRef==nil,"lAttrRef==nil");  // Un-initialized
01325  lAttrRef = new sAttrRefer( iN = idx );
01326  ASSERTION(lAttrRef!=nil,"lAttrRef!=nil");
01327 
01328  for (idx=0, lastName="@"; idx<iN; idx++) {
01329      pAttrDef = &lAttrs[ idx ];
01330      attrName = pAttrDef->attrName;
01331      gString s( attrName );
01332      s.UpString();
01333      lAttrRef->sAttrs[ idx ] = s;
01334      if ( strcmp( attrName, lastName ) ) {
01335          // New attribute unique name
01336          lAttrRef->idxUniqMax++;
01337          lAttrRef->sLUniqs[ lAttrRef->idxUniqMax ] = s;
01338          lAttrRef->idxLUniqs[ lAttrRef->idxUniqMax ] = idx;
01339      }
01340      elemName = pAttrDef->strRelatedLst;
01341      bool isAllBut = elemName[0]=='@';  // Indicates all tags but these...
01342      elemName += isAllBut;
01343      lAttrRef->lstIsAllButEtc[ idx ].SetOn( isAllBut );
01344      lAttrRef->lstRelated[ idx ].AddFromStr( elemName );
01345 
01346      // Keep last attribute name
01347      lastName = attrName;
01348  }//end FOR
01349 
01350  sAttrNorm* pNormAttr;
01351 
01352  ASSERTION(nDefAttrNorms==-1,"nDefAttrNorms==-1");
01353  for (idx=0, nDefAttrNorms=0;
01354       (pNormAttr = &lDefAttrNorms[idx])!=nil && pNormAttr->idTag!=-1;
01355       idx++) {
01356      str = pNormAttr->strAttrSeq;
01357      if ( str!=nil && str[0]!=0 ) {
01358          pNormAttr->pAttrSeq = new gSmartList;
01359          ASSERTION(pNormAttr->pAttrSeq!=nil,"pNormAttr->pAttrSeq!=nil");
01360          pNormAttr->pAttrSeq->AddFromStr( str );
01361      }
01362      nDefAttrNorms++;
01363  }
01364 
01365  // Now refresh lHAttrTypes (list of attribute types)
01366  for (idx=1;
01367       lHAttrTypes[idx].typeFamily>=0;
01368      ) idx++;
01369 
01370  lHAttrTypes[ 0 ].typeFamily = idx;
01371 
01372  return 0;
01373 }
01374 
// Parses the input held by pUnweb and distributes the lines over the parsed
// parts (through thisAddedParsedLine).
//
// Steps, as coded below:
//   1. copy the strings of *pUnweb into htmlInput;
//   2. pre-pass: for every non-text entry, record a DOCTYPE if present, look
//      the tag up with thisFindTag, and flag tags that must be skipped;
//   3. main pass: text entries are stored directly; tags go through
//      thisParseLine, whose outcome is read back from lastOpError and
//      lastWarnOpCode;
//   4. report every tag still listed in kParsed.kMust as unclosed.
//
//   fRepErr: passed to HTML_LOG for all diagnostics.
// Returns -1 when pUnweb is nil or the fill step fails; 1 when nErrorsSyntax
// or nErrorsOther is non-zero after the main pass; 0 otherwise.
//
// NOTE(review): step 4 increments nErrorsOther for each unclosed tag but the
// function still returns 0 -- confirm callers inspect the counter.
01375 int gHtmlParser::thisParse (FILE* fRepErr)
01376 {
01377  int error;
01378  unsigned i, n;
01379  unsigned currLine = 0;
01380  t_int16 id;
01381  gHtmlCouple* pObjC;
01382  sHtmlElement* pElem;
01383  char* str;
01384  char* msgError;
01385  bool isSkippedTag;
01386  bool isInvalidTag;
01387 
01388  if ( pUnweb==nil ) return -1;
01389  if ( thisFillFromUnweb( *pUnweb, htmlInput )!=0 ) return -1;
01390 
01391  // Pre-parsing, setting id's
01392  for (i=1, n=htmlInput.N(); i<=n; i++) {
01393      pObjC = htmlInput.GetCouple( i );
01394      ASSERTION(pObjC!=nil,"pObjC!=nil");
01395      currLine = pObjC->iLine;
01396      if ( pObjC->IsText() ) {
01397          // If pHStr->IsOk() is false, perhaps issue an error?
01398          DBGPRINT_MIN("DBG: [%s]: pHStr:%c OK-pHStr:%c\n",pObjC->Str(1),ISyORn(pObjC->pHStr!=nil),ISyORn(pObjC->pHStr==nil?0:pObjC->pHStr->IsOk()));
01399          continue;
01400      }
01401      str = pObjC->sTag.Str();
01402      /////printf("DBG: STRTAG: %s (%d) synError=%d\n",str,pObjC->idTag,pObjC->synError);
01403      if ( pObjC->theDocType!=0 ) {
01404          // E.g.: HTML PUBLIC -//W3C//DTD HTML 4.01 Transitional//EN http://www.w3.org/TR/html4/loose.dtd
01405          // (Given '...4.01...' string without quotes by attrL.GetAttrValue(3))
01406          DBGPRINT("DBG: DocType: %s [%s]\n",pObjC->attrL.Str(),pObjC->attrL.GetAttrValue(3));
01407          if ( docType==0 ) {
01408              docType = pObjC->theDocType;  // Always 1
01409          }
01410          else {
               // A second DOCTYPE: warn, log, then clear the warning code again
01411              SetWarn( 20 );
01412              HTML_LOG(fRepErr,LOG_WARNING,"Line %u: %s\n",currLine,"Duplicated DOCTYPE declaration");
01413              SetWarn( 0 );
01414          }
01415      }
           // thisFindTag writes the tag index straight into pObjC->idTag (-1 if unknown)
01416      pElem = thisFindTag( str, pObjC->idTag );
01417      pObjC->pElem = pElem;
01418      if ( pElem==nil ) {
01419          // Check if tag is configured to be skipped
01420          if ( htmlOpt.outOpt.IsTagSkipped( str ) )
01421              pObjC->synError = XH_SKIP_TAG_FORCE;
01422      }
01423      else {
               // Known element whose 'ctrl' is -1: also forced to be skipped
01424          if ( pElem->ctrl==-1 )
01425              pObjC->synError = XH_SKIP_TAG_FORCE;
01426      }
01427  }
01428 
01429  gHParsed kParsed;
01430  //eHState lastState = kParsed.state;
01431 
01432  // Parsing 'htmlInput'
01433  for (i=1; i<=n; i++) {
01434      pObjC = htmlInput.GetCouple( i );
01435      id = pObjC->idTag;
01436      str = htmlInput.Str( i );
01437      ASSERTION(str!=nil,"str!=nil");
01438      currLine = pObjC->iLine;
01439      if ( pObjC->IsText() ) {
01440          // The state will pass to StartAfter if at least one string is present.
01441          // The state change in this class is an exception.
01442          if ( kParsed.state==e_HS_Start ) kParsed.state = e_HS_StartAfter;
               // Notice is logged when pHStr exists and its string is empty (first char is 0)
01443          if ( pObjC->pHStr!=nil && pObjC->pHStr->Str()!=nil && pObjC->pHStr->Str()[0]==0 ) {
01444              HTML_LOG(fRepErr,LOG_NOTICE,"Line %u: invalid string, but used: '%s'\n",currLine,str);
01445          }
01446          thisAddedParsedLine( str, *pObjC, kParsed.state, true );
01447      }
01448      else {
               // The value stored in 'error' is not read in this loop; the outcome
               // is taken from lastOpError / lastWarnOpCode instead.
01449          error = thisParseLine( *pObjC, currLine, str, kParsed );
01450          msgError = GetErrorStr();
01451          isInvalidTag = lastOpError<0;
01452          isSkippedTag = pObjC->IsSkippedTag();
01453          if ( isSkippedTag ) {
01454              ; // Skip tag
                   // Skipped tags are still stored unless the skip was forced
01455              if ( pObjC->synError!=XH_SKIP_TAG_FORCE )
01456                  thisAddedParsedLine( str, *pObjC, kParsed.state, true );
01457              continue;
01458          }
01459          if ( isInvalidTag ) {
01460              nErrorsSyntax++;
01461              HTML_LOG(fRepErr,LOG_ERROR,"Line %u: Invalid tag: %s\n",currLine,str);
                   // Invalid tags are dropped unless the option asks to write them anyway
01462              if ( htmlOpt.oErrInvTagWrite.IsOn()==false ) continue;
01463          }
01464 
01465          char* strBaseHRef;
01466 
01467          if ( id==XH_IDTAG_HEAD && kParsed.state==e_HS_HeadAfter && htmlOpt.DoBaseHRef()==true ) {
01468              // Build and add new Html-Couple based on given option...
01469              strBaseHRef = htmlOpt.GetBaseHRef();
01470              myBaseHRef.Set( strBaseHRef );
01471              gHtmlCouple newObjC( currLine, "BASE", strBaseHRef );
01472              thisAddedParsedLine( newObjC.GetStr(), newObjC, kParsed.state, true );
01473          }
01474          thisAddedParsedLine( str, *pObjC, kParsed.state, lastOpError>=0 );
01475          if ( lastOpError==0 ) {
01476              if ( lastWarnOpCode==0 ) {
                       // Clean BASE tag: remember its HREF attribute, when present
01477                  if ( id==XH_IDTAG_BASE ) {
01478                      strBaseHRef = pObjC->attrL.Find("HREF");
01479                      if ( strBaseHRef!=nil ) myBaseHRef.Set( strBaseHRef );
01480                  }
01481              }
01482              else {
                       // Pending warning: log it and clear the warning code
01483                  HTML_LOG(fRepErr,LOG_WARNING,"Line %u: %s\n",currLine,msgError);
01484                  SetWarn( 0 );
01485              }
01486              continue;
01487          }
               // lastOpError is non-zero here; positive values are logged as sequence errors
01488          if ( isInvalidTag==false ) {
01489              nErrorsOther++;
01490              HTML_LOG(fRepErr,LOG_ERROR,"Line %u: bad sequence (int-reg %d): %s\n%s%s%s",
01491                       currLine,
01492                       lastOpError,
01493                       str,
01494                       msgError[0]==0 ? "\0" : "ERROR: ", msgError, msgError[0]==0 ? "\0" : "\n");
01495          }
01496      }//end ELSE IsText
01497  }//end FOR: main iter
01498 
       // Any counted error ends the parse here with 1
01499  error = nErrorsSyntax!=0 || nErrorsOther!=0;
01500  if ( error!=0 ) return 1;
01501 
       // Whatever remains in kMust is reported as unclosed; currLine still holds
       // the line of the last input entry.
01502  n = kParsed.kMust.N();
01503  for (i=1; i<=n; i++) {
01504      pObjC = (gHtmlCouple*)kParsed.kMust.GetObjectPtr( i );
01505      ASSERTION(pObjC!=nil,"pObjC"); //Academic assert
01506      nErrorsOther++;
01507      HTML_LOG(fRepErr,LOG_ERROR,"Line %u: unclosed tag (int-ref %d): %s, opened at line %u\n",
01508               currLine,
01509               10,
01510               pObjC->TagString().Str(),
01511               pObjC->iLine);
01512      DBGPRINT("DBG: ATTRS: %s (%s)\n",pObjC->attrL.Str(),pObjC->attrL.Find("HREF"));
01513  }
01514 
01515  return 0;
01516 }
01517 
// Checks one tag couple against the stack of open tags kept in 'kParsed' and
// updates that stack (push for start tags, pop for end tags).
//
//   inCouple: the tag being processed; its idTag, pElem and synError are read.
//   lineNr:   used only in the debug trace.
//   s:        the tag text, used in messages; must not be nil.
//   kParsed:  parse state holding the open-tag stack.
//
// Return values produced below: 0 when accepted (including the warning-only
// and suppressed cases); inCouple.synError for a skipped tag; otherwise the
// code handed to SetError (-8, 12, 13, 14, 129, or a positive PushTag result).
// Error codes are also recorded through SetError, warnings through SetWarn,
// and some paths place a message in sStrError.
01518 int gHtmlParser::thisParseLine (gHtmlCouple& inCouple, unsigned lineNr, char* s, gHParsed& kParsed)
01519 {
01520  // Return <=-8 on syntax error, -1/-2 on skipped tag
01521  // or non-zero for semantic error.
01522  //
01523  int error;
01524  t_int16 id;
01525  sHtmlElement* pInElem;
01526  gHtmlCouple* pCouple;
01527  bool inRefEnd, inCannotEnd, inOptEnd;
01528 
01529  ASSERTION(s!=nil,"s!=nil");
01530  SetError( 0 );
01531  id = inCouple.idTag;
01532  if ( inCouple.IsSkippedTag() ) return inCouple.synError;
01533  if ( id<0 ) return SetError( -8 );
01534  pInElem = inCouple.pElem;
01535 
01536  inRefEnd = inCouple.IsTagEnd();
01537  inCannotEnd = pInElem->CannotEndTag();
01538  inOptEnd = pInElem->MayEndTag();
01539 
01540  DBGPRINT_MIN("DBG: thisParseLine:%u [%s] inCouple=%s\n",lineNr,s,inCouple.sTag.Str());
01541 
01542  if ( inRefEnd ) {
01543      // Input was like '</B>'
01544      if ( inCannotEnd ) return SetError( 12 );  // E.g. '</BR>'
01545      // was it e.g. <A ...><B>XYZ</B></A> ?
01546      if ( kParsed.Depth()<=0 ) return SetError( 13 ); // E.g. </B> without <B>
01547      pCouple = kParsed.CurrentCouple();
           // Fast path: the end tag matches the current couple
01548      if ( id==pCouple->idTag ) {
01549          error = kParsed.PopTag( inOptEnd );
01550          ASSERTION(error==0,"kParsed.PopTag(1)");
01551          return 0;
01552      }
01553      if ( inOptEnd ) {
01554          // One opt-end-tag, and another different opt-end-tag? Check: ...
               // (This condition is always true here: the equal case returned above.)
01555          if ( id!=pCouple->idTag ) {
01556              int countTrash=0;
01557              int iOptIdx, iOptN=(int)kParsed.N();
                   // Scan from the highest index down; at most one matching entry is deleted
01558              for (iOptIdx=iOptN; iOptIdx>0 && countTrash==0; iOptIdx--) {
01559                  gHtmlCouple* pObjC = kParsed.GetCouple( (unsigned)iOptIdx );
01560                  ///printf("DBG: ###CURm: "); for (unsigned dbgIdx=1; dbgIdx<=kParsed.kMust.N(); dbgIdx++) printf("[%s]",kParsed.StrMust(dbgIdx)); printf("DBG: ### line %u:(%s).\n",((gHtmlCouple*)kParsed.kMust.EndPtr()->me)->iLine,((gHtmlCouple*)kParsed.kMust.EndPtr()->me)->sTag.Str());
01561                  ///printf("DBG: ###CURo: "); for (unsigned dbgIdx=1; dbgIdx<=kParsed.N(); dbgIdx++) printf("[%s]",((gHtmlCouple*)kParsed.GetObjectPtr(dbgIdx))->GetStr()); printf("DBG: ### line %u:(%s).\n",((gHtmlCouple*)kParsed.EndPtr()->me)->iLine,((gHtmlCouple*)kParsed.EndPtr()->me)->sTag.Str());
01562                  ///printf("DBG: sweep(%d|%d): %s (N=%u kMustN=%u)\n",id,pObjC->idTag,kParsed.CurrentCouple()->sTag.Str(),kParsed.N(),kParsed.kMust.N());
01563                  if ( id==pObjC->idTag ) {
01564                      kParsed.Delete( iOptIdx, iOptIdx );
01565                      countTrash++;
01566                  }
01567              }// end FOR iOptIdx
01568          }
01569      }
01570      else {
01571          // Here something like '</A>' must pop any pending optional-ended tags
01572          while ( kParsed.N()>kParsed.kMust.N() ) {
01573              pCouple = kParsed.CurrentCouple();
01574              if ( id==pCouple->idTag ) break;
01575              kParsed.TrashLast();
01576          }
01577      }
01578      // Update pointer
01579      pCouple = kParsed.CurrentCouple();
01580      // Now check consistency of current 'id', and the stack (kParsed)
01581      if ( id!=pCouple->idTag ) {
01582          //printf("DBG: WOW optend?%d (id={%d:%s},current={%d:%s}): ",inOptEnd,id,s,pCouple->idTag,pCouple->sTag.Str()); kParsed.Show(true); printf("STK_MUST IS: "); kParsed.kMust.Show(true);
01583          SetError( 14 ); // E.g. </B> ending <A>
01584          snprintf(sStrError,200,
01585                   "Expected %s [line %u] but got %s",
01586                   pCouple->TagString(true).Str(),
01587                   pCouple->iLine,
01588                   s);
01589          return 14;
01590      }
01591      // All ok, e.g. <B> ends here with </B>
01592      error = kParsed.PopTag( inOptEnd );
01593      ASSERTION(error==0,"kParsed.PopTag(2)");
01594  }
01595  else {
01596      // Three cases:
01597      // 1. plain '<BR>' (no end)
01598      // 2. optional-ended tag (e.g. <DD>)
01599      // 3. must-be-ended tag (e.g. <B>)
01600      if ( inOptEnd==true && inCannotEnd==false ) {
01601          // 2nd case
01602          kParsed.PushTagOptEnd( inCouple );
01603          return 0;
01604      }
01605      // 1st/3rd case
01606      // Note: 1st case: inCannotEnd=true
01607      //
01608      // Error when issuing <A>, ..., <A> (without ending the first anchor)
01609      //
01610      if ( id==XH_IDTAG_ANCHOR ) {
01611          gHtmlCouple* pCoupleAnchor = kParsed.FindCouple( XH_IDTAG_ANCHOR );
01612          if ( pCoupleAnchor!=nil ) {
01613              error = 129;
01614              SetError( error );
01615              snprintf(sStrError,200,
01616                       "Doubled anchor on line %u: already started on line %u (int-seq:%d)",
01617                       inCouple.iLine,
01618                       pCoupleAnchor->iLine,
01619                       error);
01620              return error;
01621          }
01622      }
01623      // Now pushing
01624      error = kParsed.PushTag( inCouple, inCannotEnd );
01625 
01626      if ( error==0 ) return 0;
01627      if ( error<0 ) {
               // Negative results are state-sequence problems; they can be
               // suppressed globally or per state through htmlOpt.
01628          if ( htmlOpt.oErrStateAllSuppress.IsOn() )
01629              return 0;
01630          switch ( error ) {
01631          case -103:
01632              if ( htmlOpt.oErrStateSuppress[e_HS_HeadAfter].IsOn() )
01633                  return 0;
01634              break;
01635          case -104:
01636              if ( htmlOpt.oErrStateSuppress[e_HS_Html].IsOn() )
01637                  return 0;
01638              break;
01639          case -105:
01640              if ( htmlOpt.oErrStateSuppress[e_HS_Start].IsOn() )
01641                  return 0;
01642              break;
01643          case -106:
01644              if ( htmlOpt.oErrStateSuppress[e_HS_Body].IsOn() )
01645                  return 0;
                   // No 'break' here: control falls into 'default', which only breaks
01646          default:
01647              break;
01648          }
               // Not suppressed: raised as a warning only; the message goes to
               // sStrError and the function ends with the final 'return 0'.
01649          SetWarn( 21 );
01650          snprintf(sStrError,200,
01651                   "Not in LTD sequence: %s (int-seq:%d)",
01652                   s,
01653                   -error);
01654      }// end IF error<0
01655      else {
01656          // ...then error>0
01657          return SetError( error );
01658      }
01659  }
01660  return 0;
01661 }
01662 
01663 int gHtmlParser::thisAddedParsedLine (char* s, gHtmlCouple& inCouple, eHState state, bool doAccept)
01664 {
01665  ASSERTION(s!=nil,"s!=nil");
01666  if ( s[0]==0 ) return 0;
01667  if ( fOutAll.f!=nil ) fprintf(fOutAll.f,"%s%s",s,fOutAll.NewLine());
01668  lOut.Add( s );
01669  lParts[state].AppendCouple( inCouple );
01670  return 0;
01671 }
01672 ////////////////////////////////////////////////////////////
01673 

Generated on Sat Aug 18 02:40:55 2007 for xpfweb_v2x lib by  doxygen 1.4.2