gHtmlParser.h

Go to the documentation of this file.
00001 #ifndef gHTML_PARSER_X_H
00002 #define gHTML_PARSER_X_H
00003 
00004 #include "gweb.h"
00005 #include "ghash.h"
00006 #include "glistext.h"
00007 #include "gstack.h"
00008 
00009 #include "gHtmlOpt.h"
00010 #include "gHtmlAttr.h"
00011 #include "gHtmlHATypes.h"
00012 #include "gdNetStrings.h"
00013 
// Pseudo tag identifiers and table markers used by the HTML parser.
// Negative values are parenthesized so that each macro expands as a single
// operand, whatever expression it is placed in.

// idTag value that marks a couple as plain text (no tag)
#define XH_NOTAG (-9)
// idEndTag value that marks a couple as an end tag
#define XH_ENDTAG (-11)
// 'optEnd' markers of an element: end tag is optional
#define XH_TBL_TAG_OPTEND 'O'
#define XH_TBL_TAG_OPTEND_CFG 'o'
// synError values that mark a tag as skipped
#define XH_SKIP_TAG (-1)
#define XH_SKIP_TAG_FORCE (-2)

#define XH_SKIP_ATTR (-1)

// Definitions below must be adjusted if 'lElems' changes
#define XH_IDTAG_ANCHOR 0
#define XH_IDTAG_BASE 7
#define XH_IDTAG_BODY 12
#define XH_IDTAG_FONTx 30
#define XH_IDTAG_H1 34
#define XH_IDTAG_HL 39
#define XH_IDTAG_HEAD 40
#define XH_IDTAG_HTML 42
#define XH_IDTAG_IMG 45
#define XH_IDTAG_TABLEx 78
00034 ////////////////////////////////////////////////////////////
// Element family identifiers.
// NOTE(review): presumably the value kept in sHtmlElement::family -- confirm.
enum eHtmlElementFamily {
    e_HtmlElementPhrase = 1   // the single family defined here
};
00038 
00039 struct sHtmlElement {
00040     t_int16 ctrl;   // zero(default): in use; 1: tidy; -1: skipped
00041     char* elemName;
00042     char* elemInfo;
00043     char optStart;  // letter-O: may not be present, or may not end (e.g. <BODY>) ::0|1
00044     char optEnd;    // letter-O: may not have end tag; F: must be uniquely starting (e.g. <BR>)
00045     char isDeprecated;  // D: deprecated (not allowed in transitional)
00046     char kindDTD;       // L: Loose DTD; F: belongs to frameset (i.e. frame elements, like <FRAME>)
00047     char family;
00048 
00049     bool CannotEndTag () {
00050         return optEnd=='F';
00051     }
00052     bool MayEndTag () {
00053         return (optEnd==XH_TBL_TAG_OPTEND || optEnd==XH_TBL_TAG_OPTEND_CFG) && optStart==0;
00054     }
00055     bool IsDeprecated () {
00056         return isDeprecated!=0;
00057     }
00058     short DeprecatedLevel () {
00059         return (short)(isDeprecated-'A');
00060     }
00061     bool IsFrameset () {
00062         return kindDTD=='F';
00063     }
00064     bool IsLoose () {
00065         return kindDTD=='L';
00066     }
00067 };
00068 ////////////////////////////////////////////////////////////
00069 struct sAttrDef {
00070     t_int16 ctrl;   // zero(default): in use; 1: tidy; -1: skipped
00071     char* attrName;
00072     char* strRelatedLst;
00073     char* strType;
00074     char* strImplReq;  // #IMPLIED, #REQUIRED, DATA, "%str", or "1"
00075     char chrDeprecated;
00076     char* strKindDTD;
00077     char* attrInfo;
00078 
00079     bool IsDeprecated () {
00080         return chrDeprecated=='D';
00081     }
00082 };
00083 
00084 struct sAttrRefer {
00085     sAttrRefer (t_int16 nAttrs) ;
00086     ~sAttrRefer () ;
00087 
00088     t_int16 nlAttrs;
00089     gString* sAttrs;         // [0:first attribute name, all-uppercase; (nlAttrs-1):last]
00090     gSmartList* lstRelated;  // [0:first attribute 'related elements'; (nlAttrs-1):last]
00091     gSwitch* lstIsAllButEtc; // see above, when related are 'all but' these elements
00092     t_int16 idxUniqMax;  // Maximum number of index for unique names (of attrs)
00093     gString* sLUniqs;    // List of unique names (majorated nlAttrs)
00094     t_int16* idxLUniqs;  // Indexes (see above)
00095     gString sOutHelper;  // Output helper string
00096 
00097     t_int16 FindAttr (char* attrName) {
00098         // Returns the first absolute index (for lAttrRef)
00099         t_int16 uniqIdx;
00100         return FindAttr( attrName, uniqIdx );
00101     }
00102 
00103     t_int16 FindAttr (char* attrName, t_int16& uniqIdx) ;
00104 
00105     t_int16 FindAttr (char* attrName, char* strTag) {
00106         t_int16 uniqIdx;
00107         return FindAttr( attrName, strTag, uniqIdx );
00108     }
00109 
00110     t_int16 FindAttr (char* attrName, char* strTag, t_int16& uniqIdx) ;
00111 };
00112 ////////////////////////////////////////////////////////////
00113 class gHtmlCouple : public gList {
00114 public:
00115     gHtmlCouple (unsigned lineNr, char* sText) ;
00116     gHtmlCouple (unsigned lineNr, char* strTag, char* sAttrLst, bool doAddSkippedTags=true) ;
00117     virtual ~gHtmlCouple () ;
00118 
00119     // Public data-members
00120     unsigned iLine;
00121     t_int16 idTag;  // XH_NOTAG=-9 for text
00122     t_int16 idEndTag;
00123     gString sTag;   // empty for text
00124     gHtmlString* pHStr;  // nil for non-text
00125     sHtmlElement* pElem; // nil for text (never allocated)
00126     gHAttrList attrL;    // attribute list
00127     int synError;        // -1: skipped tag, 1 basically invalid tag
00128     t_int16 coupleId;    // only used for coupling
00129     t_int16 theDocType;
00130     // Special members
00131     gHtmlCouple* oCouple;
00132     gTagCoord tcoord;
00133 
00134     // Get methods
00135     virtual bool IsOk () ;
00136     virtual bool IsText () {
00137         return idTag==XH_NOTAG;
00138     }
00139     virtual bool IsTagEnd () {
00140         return idEndTag==XH_ENDTAG;
00141     }
00142     virtual bool IsAnchor () {
00143         return idTag==0;
00144     }
00145     virtual bool IsSkippedTag () {
00146         return synError==XH_SKIP_TAG || synError==XH_SKIP_TAG_FORCE;
00147     }
00148 
00149     virtual char* GetStr () ;
00150     virtual char* GetStrForTree () ;
00151     virtual gString& TagString (bool forceEnd=false) ;
00152 
00153     virtual char* GetHRef () ;
00154 
00155     // Set methods
00156     unsigned Add (char* s) ;
00157     unsigned Add (gString& copy) ;
00158     virtual unsigned AddText (char* s) ;
00159     virtual unsigned AddTag (char* strTag) ;
00160 
00161     virtual void CopyTag (gHtmlCouple& copy) {
00162         Reset();
00163         sTag = copy.sTag;
00164         idTag = copy.idTag;
00165         idEndTag = copy.idEndTag;
00166         pElem = copy.pElem;
00167         attrL.CopyAttr( copy.attrL );
00168         synError = copy.synError;
00169         coupleId = copy.coupleId;
00170         theDocType = copy.theDocType;
00171         if ( copy.IsText() ) AddText( copy.Str(1) );
00172     }
00173 
00174 protected:
00175     gString sTagStr;
00176     gString sWholeTag;
00177     gString sKeepStr;
00178 
00179 private:
00180     // Operators,empty
00181     gHtmlCouple (gHtmlCouple& ) ; //empty
00182     gHtmlCouple& operator= (gHtmlCouple& ) ; //empty
00183 };
00184 ////////////////////////////////////////////////////////////
00185 class gHtmlContent : public gList {
00186 public:
00187     gHtmlContent () ;
00188     virtual ~gHtmlContent () ;
00189 
00190     // Get methods
00191     virtual gHtmlCouple* GetCouple (unsigned idx) ;
00192     virtual char* Str (unsigned idx) ;
00193 
00194     // Set methods
00195     unsigned Add (char* s) ;
00196     virtual int UpCaseAttributes (char* strTag, gString& sRes) ;
00197 
00198     // Specific methods
00199     virtual gHtmlOpt& GetHtmlOpt () ;
00200     virtual bool SetHtmlOpt (gHtmlOpt* pHtmlOpt) ;
00201     virtual int TagError (unsigned lineNr, int error, char* sLine, char* sShortMsg) ;
00202 
00203     // Show methods
00204     virtual void Show (bool doShowAll=true) ;
00205 
00206 protected:
00207     unsigned nLines;
00208     gHtmlOpt* theHtmlOpt;
00209 
00210     int thisAddHmtlLine (unsigned lineNr, char* s) ;
00211     int thisAddHtmlText (unsigned lineNr, char* sText) ;
00212     int thisAddHtmlTag (unsigned lineNr, char* strTag, bool isEndTag) ;
00213     int thisAddCouple (gHtmlCouple* pCouple, gList& oL) ;
00214 
00215 private:
00216     // Operators,empty
00217     gHtmlContent (gHtmlContent& ) ; //empty
00218     gHtmlContent& operator= (gHtmlContent& ) ; //empty
00219 };
00220 ////////////////////////////////////////////////////////////
00221 class gHParsed : public gStack {
00222 public:
00223     // Stack-like class for all pending-closed HTML tags
00224 
00225     gHParsed ()
00226         : state( e_HS_Start ),
00227           hasBaseHRef( false ) {
00228     }
00229     virtual ~gHParsed () {
00230     }
00231 
00232     static const char* tblStateStr[e_HS_Last];
00233 
00234     // Public data-members
00235     eHState state;
00236     bool hasBaseHRef;
00237     gStack kMust;  // Must-end tags stack
00238 
00239     // Get methods
00240     virtual int Depth () {
00241         return (int)N();
00242     }
00243 
00244     virtual char* Str (unsigned idx) ;
00245     virtual char* StrMust (unsigned idx) ;
00246 
00247     virtual gHtmlCouple* CurrentCouple () ;
00248     virtual gHtmlCouple* GetCouple (unsigned idx) ;
00249 
00250     virtual gHtmlCouple* FindCouple (t_int16 idTag) ;
00251 
00252     // Set methods
00253     virtual void PushTagOptEnd (gHtmlCouple& couple) ;
00254     virtual int PushTag (gHtmlCouple& couple, bool doCheckOnly=false) ;
00255     virtual int PopTag (bool hasOptEnd) ;
00256     virtual void TrashLast () ;
00257 
00258 protected:
00259     int thisPush (gHtmlCouple& couple, gStack& aStack) ;
00260     int thisPushCouple (gHtmlCouple& couple, bool hasOptEnd) ;
00261 
00262 private:
00263     // Operators,empty
00264     gHParsed (gHParsed& ) ; //empty
00265     gHParsed& operator= (gHParsed &) ; //empty
00266 };
00267 ////////////////////////////////////////////////////////////
00268 class gHList : public gHtmlContent {
00269 public:
00270     // gHList class suits for parts of HTML, as a list
00271     gHList ()
00272         : coupleFakeBody( nil ) {
00273     }
00274     virtual ~gHList () {
00275         delete coupleFakeBody;
00276     }
00277 
00278     // Public data-members
00279     gHtmlCouple* coupleFakeBody;
00280 
00281     // Get methods
00282     virtual gHtmlCouple* GetCouple (unsigned idx) ;
00283 
00284     // Set methods
00285     virtual void AppendCouple (gHtmlCouple& couple) ;
00286 
00287 private:
00288     // Operators,empty
00289     gHList (gHList& ) ; //empty
00290     gHList& operator= (gHList& ) ; //empty
00291 };
00292 ////////////////////////////////////////////////////////////
00293 class gHtmlParser : public gControl {
00294 public:
00295     gHtmlParser (gUnweb* ptrUnweb) ;
00296     virtual ~gHtmlParser () ;
00297 
00298     // Public data-members
00299     gFileOut fOutAll;
00300     t_int16 docType;
00301     int nErrorsSyntax;
00302     int nErrorsOther;
00303     int nWarnings;
00304     int lastWarnOpCode;
00305     gHtmlOpt htmlOpt;
00306     gList lOut;
00307     gHList lParts[e_HS_Last];
00308 
00309     // Get methods
00310     virtual t_int16 GetTagMaxId () {
00311         return nElems;
00312     }
00313 
00314     virtual sHtmlElement* GetTagElement (t_int16 idxTag) ;
00315 
00316     virtual sHtmlElement* FindTag (char* strTag) {
00317         t_int16 idxTag;
00318         return thisFindTag( strTag, idxTag );
00319     }
00320 
00321     virtual int FindTagIdx (char* strTag, t_int16& idxTag) {
00322         int val;
00323         thisFindTag( strTag, idxTag );
00324         val = (int)idxTag;
00325         return val;
00326     }
00327 
00328     virtual sAttrDef* GetAttrDef (t_int16 idxAttr) ;
00329 
00330     virtual sAttrRefer* GetAttrRef () ;
00331 
00332     virtual sAttrNorm* GetDefaultAttrNorm (t_int16 idxNorm) ;
00333 
00334     virtual t_int16 GetNAttrType () {
00335         return lHAttrTypes[ 0 ].typeFamily;
00336     }
00337 
00338     virtual sHAttrType* GetAttrType (t_int16 idxType) ;
00339 
00340     // Set methods
00341     virtual void ReleaseHash () ;
00342     virtual bool SetOptions (gHtmlOpt& copy) ;
00343 
00344     // Specific methods
00345     virtual int Parse (FILE* fRepErr) ;
00346     virtual int SetError (int opError) ;
00347     virtual int SetWarn (int opError) ;
00348 
00349     // Show & debug methods
00350     int ShowTree (FILE* fRepErr) ;
00351     int Show_dbg (bool doShowAll=true) ;
00352 
00353 protected:
00354     gString myBaseHRef;
00355     gUnweb* pUnweb;
00356     gHtmlContent htmlInput;
00357 
00358     // These-methods
00359     int thisFillFromUnweb (gUnweb& unweb, gHtmlContent& hInput) ;
00360     sHtmlElement* thisFindTag (char* strTag, t_int16& idxTag) ;
00361     int thisParse (FILE* fRepErr) ;
00362 
00363 private:
00364     static t_int16 nElems;
00365     static sHtmlElement lElems[];
00366     static sAttrDef lAttrs[];
00367     static sAttrRefer* lAttrRef;
00368     static t_int16 nDefAttrNorms;
00369     static sAttrNorm lDefAttrNorms[];
00370     static sHAttrType lHAttrTypes[];
00371 
00372     static gHashTriple* hElems;
00373     // Note: hElems should not be an allocated object (but rather a pointer),
00374     // otherwise the gTop collection will return an error.
00375 
00376     int thisInitTbl (t_int16& size) ;
00377     int thisParseLine (gHtmlCouple& inCouple, unsigned lineNr, char* s, gHParsed& kParsed) ;
00378     int thisAddedParsedLine (char* s, gHtmlCouple& inCouple, eHState state, bool doAccept) ;
00379 
00380     // Operators,empty
00381     gHtmlParser (gHtmlParser& ) ; //empty
00382     gHtmlParser& operator= (gHtmlParser& ) ; //empty
00383 };
00384 ////////////////////////////////////////////////////////////
00385 #endif //gHTML_PARSER_X_H
00386 

Generated on Sat Aug 18 02:40:56 2007 for xpfweb_v2x lib by  doxygen 1.4.2