00001 #ifndef gHTML_PARSER_X_H
00002 #define gHTML_PARSER_X_H
00003
00004 #include "gweb.h"
00005 #include "ghash.h"
00006 #include "glistext.h"
00007 #include "gstack.h"
00008
00009 #include "gHtmlOpt.h"
00010 #include "gHtmlAttr.h"
00011 #include "gHtmlHATypes.h"
00012 #include "gdNetStrings.h"
00013
// ---------------------------------------------------------------------------
// Sentinels and well-known tag identifiers used by the HTML parser classes.
// Negative replacement lists are parenthesized so that each macro always
// expands as a single operand inside a larger expression.
// ---------------------------------------------------------------------------

// Tag-id sentinels: gHtmlCouple::IsText() tests idTag against XH_NOTAG and
// gHtmlCouple::IsTagEnd() tests idEndTag against XH_ENDTAG.
#define XH_NOTAG (-9)
#define XH_ENDTAG (-11)

// Markers tested against sHtmlElement::optEnd by sHtmlElement::MayEndTag().
#define XH_TBL_TAG_OPTEND 'O'
#define XH_TBL_TAG_OPTEND_CFG 'o'

// Values tested against gHtmlCouple::synError by gHtmlCouple::IsSkippedTag().
#define XH_SKIP_TAG (-1)
#define XH_SKIP_TAG_FORCE (-2)

// Attribute counterpart of XH_SKIP_TAG (not referenced in this header).
#define XH_SKIP_ATTR (-1)

// Numeric identifiers of selected tags.
// NOTE(review): presumably indices into the parser's element table -- confirm.
#define XH_IDTAG_ANCHOR 0
#define XH_IDTAG_BASE 7
#define XH_IDTAG_BODY 12
#define XH_IDTAG_FONTx 30
#define XH_IDTAG_H1 34
#define XH_IDTAG_HL 39
#define XH_IDTAG_HEAD 40
#define XH_IDTAG_HTML 42
#define XH_IDTAG_IMG 45
#define XH_IDTAG_TABLEx 78
00034
// Element family codes; only the "phrase" family is defined here.
// NOTE(review): presumably the values stored in sHtmlElement::family -- confirm.
enum eHtmlElementFamily
{
    e_HtmlElementPhrase = 1
};
00038
00039 struct sHtmlElement {
00040 t_int16 ctrl;
00041 char* elemName;
00042 char* elemInfo;
00043 char optStart;
00044 char optEnd;
00045 char isDeprecated;
00046 char kindDTD;
00047 char family;
00048
00049 bool CannotEndTag () {
00050 return optEnd=='F';
00051 }
00052 bool MayEndTag () {
00053 return (optEnd==XH_TBL_TAG_OPTEND || optEnd==XH_TBL_TAG_OPTEND_CFG) && optStart==0;
00054 }
00055 bool IsDeprecated () {
00056 return isDeprecated!=0;
00057 }
00058 short DeprecatedLevel () {
00059 return (short)(isDeprecated-'A');
00060 }
00061 bool IsFrameset () {
00062 return kindDTD=='F';
00063 }
00064 bool IsLoose () {
00065 return kindDTD=='L';
00066 }
00067 };
00068
00069 struct sAttrDef {
00070 t_int16 ctrl;
00071 char* attrName;
00072 char* strRelatedLst;
00073 char* strType;
00074 char* strImplReq;
00075 char chrDeprecated;
00076 char* strKindDTD;
00077 char* attrInfo;
00078
00079 bool IsDeprecated () {
00080 return chrDeprecated=='D';
00081 }
00082 };
00083
00084 struct sAttrRefer {
00085 sAttrRefer (t_int16 nAttrs) ;
00086 ~sAttrRefer () ;
00087
00088 t_int16 nlAttrs;
00089 gString* sAttrs;
00090 gSmartList* lstRelated;
00091 gSwitch* lstIsAllButEtc;
00092 t_int16 idxUniqMax;
00093 gString* sLUniqs;
00094 t_int16* idxLUniqs;
00095 gString sOutHelper;
00096
00097 t_int16 FindAttr (char* attrName) {
00098
00099 t_int16 uniqIdx;
00100 return FindAttr( attrName, uniqIdx );
00101 }
00102
00103 t_int16 FindAttr (char* attrName, t_int16& uniqIdx) ;
00104
00105 t_int16 FindAttr (char* attrName, char* strTag) {
00106 t_int16 uniqIdx;
00107 return FindAttr( attrName, strTag, uniqIdx );
00108 }
00109
00110 t_int16 FindAttr (char* attrName, char* strTag, t_int16& uniqIdx) ;
00111 };
00112
00113 class gHtmlCouple : public gList {
00114 public:
00115 gHtmlCouple (unsigned lineNr, char* sText) ;
00116 gHtmlCouple (unsigned lineNr, char* strTag, char* sAttrLst, bool doAddSkippedTags=true) ;
00117 virtual ~gHtmlCouple () ;
00118
00119
00120 unsigned iLine;
00121 t_int16 idTag;
00122 t_int16 idEndTag;
00123 gString sTag;
00124 gHtmlString* pHStr;
00125 sHtmlElement* pElem;
00126 gHAttrList attrL;
00127 int synError;
00128 t_int16 coupleId;
00129 t_int16 theDocType;
00130
00131 gHtmlCouple* oCouple;
00132 gTagCoord tcoord;
00133
00134
00135 virtual bool IsOk () ;
00136 virtual bool IsText () {
00137 return idTag==XH_NOTAG;
00138 }
00139 virtual bool IsTagEnd () {
00140 return idEndTag==XH_ENDTAG;
00141 }
00142 virtual bool IsAnchor () {
00143 return idTag==0;
00144 }
00145 virtual bool IsSkippedTag () {
00146 return synError==XH_SKIP_TAG || synError==XH_SKIP_TAG_FORCE;
00147 }
00148
00149 virtual char* GetStr () ;
00150 virtual char* GetStrForTree () ;
00151 virtual gString& TagString (bool forceEnd=false) ;
00152
00153 virtual char* GetHRef () ;
00154
00155
00156 unsigned Add (char* s) ;
00157 unsigned Add (gString& copy) ;
00158 virtual unsigned AddText (char* s) ;
00159 virtual unsigned AddTag (char* strTag) ;
00160
00161 virtual void CopyTag (gHtmlCouple& copy) {
00162 Reset();
00163 sTag = copy.sTag;
00164 idTag = copy.idTag;
00165 idEndTag = copy.idEndTag;
00166 pElem = copy.pElem;
00167 attrL.CopyAttr( copy.attrL );
00168 synError = copy.synError;
00169 coupleId = copy.coupleId;
00170 theDocType = copy.theDocType;
00171 if ( copy.IsText() ) AddText( copy.Str(1) );
00172 }
00173
00174 protected:
00175 gString sTagStr;
00176 gString sWholeTag;
00177 gString sKeepStr;
00178
00179 private:
00180
00181 gHtmlCouple (gHtmlCouple& ) ;
00182 gHtmlCouple& operator= (gHtmlCouple& ) ;
00183 };
00184
00185 class gHtmlContent : public gList {
00186 public:
00187 gHtmlContent () ;
00188 virtual ~gHtmlContent () ;
00189
00190
00191 virtual gHtmlCouple* GetCouple (unsigned idx) ;
00192 virtual char* Str (unsigned idx) ;
00193
00194
00195 unsigned Add (char* s) ;
00196 virtual int UpCaseAttributes (char* strTag, gString& sRes) ;
00197
00198
00199 virtual gHtmlOpt& GetHtmlOpt () ;
00200 virtual bool SetHtmlOpt (gHtmlOpt* pHtmlOpt) ;
00201 virtual int TagError (unsigned lineNr, int error, char* sLine, char* sShortMsg) ;
00202
00203
00204 virtual void Show (bool doShowAll=true) ;
00205
00206 protected:
00207 unsigned nLines;
00208 gHtmlOpt* theHtmlOpt;
00209
00210 int thisAddHmtlLine (unsigned lineNr, char* s) ;
00211 int thisAddHtmlText (unsigned lineNr, char* sText) ;
00212 int thisAddHtmlTag (unsigned lineNr, char* strTag, bool isEndTag) ;
00213 int thisAddCouple (gHtmlCouple* pCouple, gList& oL) ;
00214
00215 private:
00216
00217 gHtmlContent (gHtmlContent& ) ;
00218 gHtmlContent& operator= (gHtmlContent& ) ;
00219 };
00220
00221 class gHParsed : public gStack {
00222 public:
00223
00224
00225 gHParsed ()
00226 : state( e_HS_Start ),
00227 hasBaseHRef( false ) {
00228 }
00229 virtual ~gHParsed () {
00230 }
00231
00232 static const char* tblStateStr[e_HS_Last];
00233
00234
00235 eHState state;
00236 bool hasBaseHRef;
00237 gStack kMust;
00238
00239
00240 virtual int Depth () {
00241 return (int)N();
00242 }
00243
00244 virtual char* Str (unsigned idx) ;
00245 virtual char* StrMust (unsigned idx) ;
00246
00247 virtual gHtmlCouple* CurrentCouple () ;
00248 virtual gHtmlCouple* GetCouple (unsigned idx) ;
00249
00250 virtual gHtmlCouple* FindCouple (t_int16 idTag) ;
00251
00252
00253 virtual void PushTagOptEnd (gHtmlCouple& couple) ;
00254 virtual int PushTag (gHtmlCouple& couple, bool doCheckOnly=false) ;
00255 virtual int PopTag (bool hasOptEnd) ;
00256 virtual void TrashLast () ;
00257
00258 protected:
00259 int thisPush (gHtmlCouple& couple, gStack& aStack) ;
00260 int thisPushCouple (gHtmlCouple& couple, bool hasOptEnd) ;
00261
00262 private:
00263
00264 gHParsed (gHParsed& ) ;
00265 gHParsed& operator= (gHParsed &) ;
00266 };
00267
00268 class gHList : public gHtmlContent {
00269 public:
00270
00271 gHList ()
00272 : coupleFakeBody( nil ) {
00273 }
00274 virtual ~gHList () {
00275 delete coupleFakeBody;
00276 }
00277
00278
00279 gHtmlCouple* coupleFakeBody;
00280
00281
00282 virtual gHtmlCouple* GetCouple (unsigned idx) ;
00283
00284
00285 virtual void AppendCouple (gHtmlCouple& couple) ;
00286
00287 private:
00288
00289 gHList (gHList& ) ;
00290 gHList& operator= (gHList& ) ;
00291 };
00292
00293 class gHtmlParser : public gControl {
00294 public:
00295 gHtmlParser (gUnweb* ptrUnweb) ;
00296 virtual ~gHtmlParser () ;
00297
00298
00299 gFileOut fOutAll;
00300 t_int16 docType;
00301 int nErrorsSyntax;
00302 int nErrorsOther;
00303 int nWarnings;
00304 int lastWarnOpCode;
00305 gHtmlOpt htmlOpt;
00306 gList lOut;
00307 gHList lParts[e_HS_Last];
00308
00309
00310 virtual t_int16 GetTagMaxId () {
00311 return nElems;
00312 }
00313
00314 virtual sHtmlElement* GetTagElement (t_int16 idxTag) ;
00315
00316 virtual sHtmlElement* FindTag (char* strTag) {
00317 t_int16 idxTag;
00318 return thisFindTag( strTag, idxTag );
00319 }
00320
00321 virtual int FindTagIdx (char* strTag, t_int16& idxTag) {
00322 int val;
00323 thisFindTag( strTag, idxTag );
00324 val = (int)idxTag;
00325 return val;
00326 }
00327
00328 virtual sAttrDef* GetAttrDef (t_int16 idxAttr) ;
00329
00330 virtual sAttrRefer* GetAttrRef () ;
00331
00332 virtual sAttrNorm* GetDefaultAttrNorm (t_int16 idxNorm) ;
00333
00334 virtual t_int16 GetNAttrType () {
00335 return lHAttrTypes[ 0 ].typeFamily;
00336 }
00337
00338 virtual sHAttrType* GetAttrType (t_int16 idxType) ;
00339
00340
00341 virtual void ReleaseHash () ;
00342 virtual bool SetOptions (gHtmlOpt& copy) ;
00343
00344
00345 virtual int Parse (FILE* fRepErr) ;
00346 virtual int SetError (int opError) ;
00347 virtual int SetWarn (int opError) ;
00348
00349
00350 int ShowTree (FILE* fRepErr) ;
00351 int Show_dbg (bool doShowAll=true) ;
00352
00353 protected:
00354 gString myBaseHRef;
00355 gUnweb* pUnweb;
00356 gHtmlContent htmlInput;
00357
00358
00359 int thisFillFromUnweb (gUnweb& unweb, gHtmlContent& hInput) ;
00360 sHtmlElement* thisFindTag (char* strTag, t_int16& idxTag) ;
00361 int thisParse (FILE* fRepErr) ;
00362
00363 private:
00364 static t_int16 nElems;
00365 static sHtmlElement lElems[];
00366 static sAttrDef lAttrs[];
00367 static sAttrRefer* lAttrRef;
00368 static t_int16 nDefAttrNorms;
00369 static sAttrNorm lDefAttrNorms[];
00370 static sHAttrType lHAttrTypes[];
00371
00372 static gHashTriple* hElems;
00373
00374
00375
00376 int thisInitTbl (t_int16& size) ;
00377 int thisParseLine (gHtmlCouple& inCouple, unsigned lineNr, char* s, gHParsed& kParsed) ;
00378 int thisAddedParsedLine (char* s, gHtmlCouple& inCouple, eHState state, bool doAccept) ;
00379
00380
00381 gHtmlParser (gHtmlParser& ) ;
00382 gHtmlParser& operator= (gHtmlParser& ) ;
00383 };
00384
00385 #endif //gHTML_PARSER_X_H
00386