gweb.cpp

Go to the documentation of this file.
00001 // gweb.cpp -- Version 0.2
00002 
00003 #include <string.h>
00004 #include "gweb.h"
00005 #include "gstringext.h"
00006 
00007 ////////////////////////////////////////////////////////////
00008 // Static members
00009 // ---------------------------------------------------------
00010 // void!
00011 
00012 ////////////////////////////////////////////////////////////
00013 gIntCoord::gIntCoord ()
00014     : iAux( 0 ),
00015       y( 0 ),
00016       z( 0 )
00017 {
00018 }
00019 
00020 gIntCoord::gIntCoord (int ax, int ay)
00021     : gInt( ax ),
00022       iAux( 0 ),
00023       y( ay ),
00024       z( 0 )
00025 {
00026 }
00027 
00028 gIntCoord::gIntCoord (int ax, int ay, int az)
00029     : gInt( ax ),
00030       iAux( 0 ),
00031       y( ay ),
00032       z( az )
00033 {
00034 }
00035 
00036 gStorage* gIntCoord::NewObject ()
00037 {
00038  gIntCoord* a = new gIntCoord( c, y, z );
00039  return a;
00040 }
00041 
00042 t_uchar* gIntCoord::ToString (t_uchar* uBuf)
00043 {
00044  if ( uBuf==nil ) return nil;
00045  if ( z!=0 )
00046      sprintf( (char*)uBuf, "[%d,%d,%d]", c, y, z );
00047  else
00048      sprintf( (char*)uBuf, "[%d,%d]", c, y );
00049  return uBuf;
00050 }
00051 
00052 gIntCoord& gIntCoord::operator= (gIntCoord& copy)
00053 {
00054  iAux = copy.iAux;
00055  c = copy.GetX();
00056  y = copy.GetY();
00057  z = copy.GetZ();
00058  return *this;
00059 }
00060 
00061 void gIntCoord::Show (bool doShowAll)
00062 {
00063  printf("[%d,%d]%s",c,y,doShowAll?"\n":"\0");
00064 }
00065 ////////////////////////////////////////////////////////////
00066 gTagCoord::gTagCoord ()
00067     : opId( 0 ),
00068       depth( 0 )
00069 {
00070 }
00071 
00072 gTagCoord::~gTagCoord ()
00073 {
00074 }
00075 
00076 void gTagCoord::Reset ()
00077 {
00078  opId = 0;
00079  depth = 0;
00080  gIntCoord::Reset();
00081 }
00082 
00083 bool gTagCoord::SetZ (int az)
00084 {
00085  bool assignOk = az>GetY();
00086  ASSERTION(assignOk,"assignOk");
00087  return gIntCoord::SetZ( az ) && assignOk;
00088 }
00089 ////////////////////////////////////////////////////////////
00090 bool sCoordText::IsOk ()
00091 {
00092  bool isOk = coordL.N()==textL.N();
00093  ASSERTION(isOk,"coordL.N()==textL.N()");
00094  return isOk;
00095 }
00096 
00097 unsigned sCoordText::N ()
00098 {
00099  if ( IsOk()==false ) return 0;
00100  return coordL.N();
00101 }
00102 
00103 char* sCoordText::Str (unsigned idx)
00104 {
00105  return textL.Str( idx );
00106 }
00107 
00108 gIntCoord& sCoordText::GetCoord (unsigned idx)
00109 {
00110  gStorage* inObj = coordL.GetObjectPtr( idx );
00111  gIntCoord* pairPtr = (gIntCoord*)inObj;
00112  ASSERTION(pairPtr!=nil,"pairPtr!=nil");
00113  return *pairPtr;
00114 }
00115 
00116 unsigned sCoordText::FindCoordX (int x)
00117 {
00118  gList foundL, foundTxtL;
00119  return FindCoordX( x, foundL, foundTxtL );
00120 }
00121 
00122 unsigned sCoordText::FindCoordX (int x, gList& foundL, gList& foundTxtL)
00123 {
00124  unsigned idx, firstIdx=0, n = N();
00125  int thisX, y, z;
00126 
00127  for (idx=1; idx<=n; idx++) {
00128      thisX = GetCoord( idx ).GetX();
00129      if ( thisX==x ) {
00130          if ( firstIdx==0 ) firstIdx = idx;
00131          y = GetCoord( idx ).GetY();
00132          z = GetCoord( idx ).GetZ();
00133          AddCoordToList( x, y, z, foundL );
00134          foundTxtL.Add( textL.Str( idx ) );
00135      }
00136  }
00137  return firstIdx;
00138 }
00139 
00140 bool sCoordText::Add (int x, int y, char* s)
00141 {
00142  ASSERTION(s!=nil,"s!=nil");
00143  gString aS( s );
00144  return Add( x, y, 0, aS );
00145 }
00146 
00147 bool sCoordText::Add (int x, int y, int z, gString& s)
00148 {
00149  gString sTrim( s );
00150  sTrim.Trim();
00151  if ( sTrim.IsEmpty() ) return false;
00152 
00153 #ifdef DEBUG
00154  if ( z==0 ) {
00155      printf("DBG:AddTxt(%d,%d):'%s' as '%s'\n",x,y,s.Str(),sTrim.Str());
00156  }
00157  else {
00158      printf("DBG:AddCmd(%d,%d):%s!\n",x,y,s.Str());
00159  }
00160 #endif //DEBUG
00161 
00162  AddCoord( x, y, z );
00163  AddText( sTrim.Str() );
00164  return true;
00165 }
00166 
00167 bool sCoordText::Add (int x, int y, int z, char* s)
00168 {
00169  ASSERTION(s!=nil,"s!=nil");
00170  gString aS( s );
00171  return Add( x, y, z, aS );
00172 }
00173 
00174 bool sCoordText::AddCoord (int x, int y, int z)
00175 {
00176  return AddCoordToList( x, y, z, coordL );
00177 }
00178 
00179 bool sCoordText::AddCoordToList (int x, int y, int z, gList& resL)
00180 {
00181  bool isOk;
00182  gIntCoord* newObj = new gIntCoord( x, y, z );
00183  ASSERTION(newObj!=nil,"newObj!=nil");
00184  isOk = IsOk();
00185  resL.AppendObject( newObj );
00186  return isOk;
00187 }
00188 
00189 bool sCoordText::AddText (gString& s)
00190 {
00191  return AddText( s.Str() );
00192 }
00193 
00194 bool sCoordText::AddText (char* s)
00195 {
00196  if ( s==nil ) return false;
00197  textL.Add( s );
00198  return true;
00199 }
00200 
00201 void sCoordText::Delete ()
00202 {
00203  coordL.Delete();
00204  textL.Delete();
00205 }
00206 
00207 void sCoordText::Report (FILE* fRep, bool doShowAll)
00208 {
00209  unsigned i, n = N();
00210 
00211  if ( fRep==nil ) fRep = stdout;
00212  for (i=1; i<=n; i++) {
00213      fprintf(fRep,"[%s%d,%s%d]:%s\n",
00214              doShowAll?"Line:":"\0",
00215              GetCoord( i ).GetX(),
00216              doShowAll?"Col:":":",
00217              GetCoord( i ).GetY(),
00218              Str( i ));
00219  }
00220 }
00221 ////////////////////////////////////////////////////////////
00222 gUnweb::gUnweb (char* fName, bool doVerbose)
00223     : gFileFetch( fName, -1, doVerbose ),
00224       firstHtmlError( 0 ),
00225       firstHtmlErrLNr( 0 ),
00226       lastHtmlError( 0 ),
00227       lastHtmlErrLNr( 0 ),
00228       nHtmlErrors( 0 ),
00229       showKind( e_ShowAll ),
00230       isVerbose( doVerbose ),
00231       tagCaseCh( e_CaseUnchange ),
00232       scriptLevel( -9 ),
00233       cCHR_HTM_PARSED( cCHR_HTM_PARSED_DEFAULT ),
00234       cCHR_HTM_INFO( cCHR_HTM_INFO_DEFAULT )
00235 {
00236  // gFileFetch calls SetDeviceReport( e_fStderr ) whenever doVerbose=True
00237  SetChrHtmParsed( cCHR_HTM_PARSED );
00238 }
00239 
00240 gUnweb::gUnweb (gString& sInput, bool doVerbose)
00241     : gFileFetch( sInput, doVerbose ),
00242       firstHtmlError( 0 ),
00243       firstHtmlErrLNr( 0 ),
00244       lastHtmlError( 0 ),
00245       lastHtmlErrLNr( 0 ),
00246       nHtmlErrors( 0 ),
00247       showKind( e_ShowAll ),
00248       isVerbose( doVerbose ),
00249       tagCaseCh( e_CaseUnchange ),
00250       scriptLevel( -9 ),
00251       cCHR_HTM_PARSED( cCHR_HTM_PARSED_DEFAULT ),
00252       cCHR_HTM_INFO( cCHR_HTM_INFO_DEFAULT )
00253 {
00254  SetChrHtmParsed( cCHR_HTM_PARSED );
00255 }
00256 
00257 gUnweb::~gUnweb ()
00258 {
00259 }
00260 
00261 FILE* gUnweb::VerboseStream ()
00262 {
00263  if ( isVerbose==false ) return nil;
00264  ASSERTION(fVRepErr!=nil,"fVRepErr!=nil");
00265  return fVRepErr;
00266 }
00267 
00268 bool gUnweb::IsBufferOk ()
00269 {
00270  return IsOpened() && gFileFetch::IsBufferOk();
00271 }
00272 
00273 char* gUnweb::Str (unsigned idx)
00274 {
00275  return
00276      HasProcessed() ? coordSerial.Str( idx ) : aL.Str( idx );
00277 }
00278 
00279 void gUnweb::SetChrHtmParsed (t_uchar c)
00280 {
00281  cCHR_HTM_PARSED = c;
00282  sCHR_HTM_PARSED[0] = (char)c;
00283  sCHR_HTM_PARSED[1] = 0;
00284 }
00285 
00286 unsigned gUnweb::FindStringKey (gString& s, char* sub, unsigned startPos, unsigned endPos)
00287 {
00288  unsigned k, uLen = s.Length();
00289  short quoting = 0;
00290  t_uchar uChr;
00291  char* str;
00292 
00293  ASSERTION(sub!=nil,"sub!=nil");
00294  if ( uLen==0 ) return 0;
00295 
00296  ASSERTION(startPos>0,"startPos>0");
00297  ASSERTION(endPos>0,"endPos>0");
00298  ASSERTION(endPos<=uLen,"endPos<=uLen");
00299 
00300  if ( sub[0]==0 ) return 0;  // Nothing to find
00301 
00302  for (k=startPos, str=s.Str()+k-1; k<=endPos; k++, str++) {
00303      uChr = s[k];
00304      if ( uChr=='"' ) {
00305          quoting = quoting==0;
00306      }
00307      if ( quoting!=0 ) continue;
00308      if ( gStrControl::Self().Find( str, sub )==1 ) {
00309          return k;
00310      }
00311  }
00312  return 0;
00313 }
00314 
00315 unsigned gUnweb::FindStringKey (gString& s, char* sub)
00316 {
00317  return FindStringKey( s, sub, 1, s.Length() );
00318 }
00319 
00320 unsigned gUnweb::FindStringKey (char* s, char* sub, unsigned startPos, unsigned endPos)
00321 {
00322  ASSERTION(s!=nil,"s!=nil");
00323  gString aS( s );
00324  return FindStringKey( aS, sub, startPos, endPos );
00325 }
00326 
00327 unsigned gUnweb::FindStringKey (char* s, char* sub)
00328 {
00329  ASSERTION(s!=nil,"s!=nil");
00330  gString aS( s );
00331  return FindStringKey( aS, sub );
00332 }
00333 
00334 int gUnweb::Dump (FILE* fRepErr)
00335 {
00336  int error;
00337  gList headL;
00338 
00339  error = thisPreProcess( fRepErr, aL, headL );
00340  if ( error<0 ) return 0;  // Not re-parsing...just return.
00341  if ( error!=0 ) {
00342      if ( isVerbose ) fprintf(VerboseStream(),"Error pre-processing HTML: error-code: %d.\n",error);
00343      return error;
00344  }
00345 
00346  gList oL;
00347  error = thisPostProcess( fRepErr, headL, stripL, oL, coordSerial );
00348  if ( error!=0 ) return error;
00349 
00350  DBGPRINT_WEB("thisPostProcess: DONE\n");
00351 
00352 #ifdef DEBUG_WEB
00353  unsigned ix;
00354  for (ix=1; ix<=coordSerial.N(); ix++) {
00355      unsigned eX=coordSerial.GetCoord( ix ).GetX();
00356      printf("WEB: ix=%u (%u): %s\n",ix,eX,Str(ix));
00357  }
00358  // One bug found in POST-PROCESS: </SCRIPT> tags disapear if xyz<SCRIPT>abc</SCRIPT> exists (same line).
00359  // Corrected, ok now.
00360 #endif //DEBUG_WEB
00361 
00362  // At this point, 'coordSerial' contains the serialized HTML, i.e.,
00363  // one line per command or text.
00364 
00365  thisConsolidateData( stripL, 1, stripL.N(), 2 );
00366 
00367  if ( isVerbose ) fprintf(VerboseStream(),"gUnweb::Post-Consolidate (%u lines)\n",coordSerial.N());
00368 
00369  thisConsolidateDataCoord( coordParseErr, coordSerial );
00370 
00371  if ( isVerbose ) fprintf(VerboseStream(),"gUnweb::Post-Consolidate-check (%u error(s))\n",coordParseErr.N());
00372 
00373 #ifdef DEBUG
00374  if ( 1 ) {
00375      printf("DBG:{\n");
00376      for (short dbgIter=1, dbgAll=1; dbgIter<=(short)coordSerial.N(); dbgIter++) {
00377          printf("[%d,%d:%3d]:%s%s%s\n",
00378                 coordSerial.GetCoord( dbgIter ).GetX(),
00379                 coordSerial.GetCoord( dbgIter ).GetY(),
00380                 coordSerial.GetCoord( dbgIter ).GetZ(),
00381                 dbgAll?"'":"\0",
00382                 coordSerial.Str( dbgIter ),
00383                 dbgAll?"'":"\0");
00384      }
00385      printf("DBG:}\n");
00386      if ( coordParseErr.N()>0 ) {
00387          printf("DBG:TXT-ERRORS:START:");
00388          coordParseErr.Report(stdout,true);
00389          printf("DBG:TXT-ERRORS:END.\n");
00390      }
00391  }
00392 #endif //DEBUG
00393 
00394  thisCutCommentsInTxt( coordComments, coordSerial );
00395 
00396  return nHtmlErrors>0;
00397 }
00398 
00399 bool gUnweb::SetVerboseStream (FILE* aFRepErr)
00400 {
00401  fVRepErr = aFRepErr;
00402  isVerbose = fVRepErr!=nil;
00403  return isVerbose;
00404 }
00405 
00406 int gUnweb::thisConsolidateData (gList& ioL, unsigned lowL, unsigned highL, short step)
00407 {
00408  unsigned i;
00409  unsigned kIter, kCode;
00410  unsigned kLen;
00411  char* str;
00412 
00413  for (i=lowL; i<=highL; i++) {
00414      str = ioL.Str( i );
00415      kLen = (unsigned)strlen( str );
00416      for (kIter=0; kIter<kLen; kIter++) {
00417          kCode = (t_uchar)str[kIter];
00418          switch ( step ) {
00419          case 1:
00420          case 2:
00421              if ( kCode==cCHR_HTM_PARSED )
00422                  str[kIter] = ' '; // a blank or cCHR_HTM_INFO
00423              break;
00424          default:
00425              break;
00426          }
00427      }
00428  }
00429 
00430  switch ( step ) {
00431  case 2:  //Use 1 or 2 for debug (1st or 2nd parse-step)
00432      // Under normal input and post-processing, this check is not useful!
00433 #ifdef DEBUG
00434      for (i=lowL; i<=highL; i++) {
00435          gString sTemp( ioL.Str( i ) );
00436          sTemp.TrimRight();
00437          if ( sTemp.Length()>0 ) printf("DBG %3d:@@ %s\n",i,sTemp.Str());
00438      }
00439 #endif //DEBUG
00440      break;
00441  default:
00442      break;
00443  }
00444  return 0;
00445 }
00446 
00447 int gUnweb::thisConsolidateDataCoord (sCoordText& cErr, sCoordText& coord)
00448 {
00449  unsigned i, k, n=coord.N();
00450  unsigned kLen;
00451  unsigned pos, posEnd;
00452  unsigned x, y;
00453  char* str;
00454  short levelScript=0;
00455  bool hasSomeScript;
00456 
00457  for (i=1; i<=n; i++) {
00458      short quoting = 0, quoting2 = 0;
00459      short notOkCount = 0, doesEnd = 0;
00460      t_uchar uChr;
00461      gString sCmd( str = coord.Str( i ) );
00462      gString sCmdUp( str );
00463      sCmdUp.UpString();
00464      x = coord.GetCoord(i).GetX();
00465      y = coord.GetCoord(i).GetY();
00466 
00467      DBGPRINT_WEB("WEB: DataCoord(%u,%u): %s\n",x,y,str);
00468      // Scripting skip
00469      hasSomeScript = false;
00470      if ( (pos = sCmdUp.Find("<SCRIPT"))>0 ) {
00471          if ( levelScript!=0 ) {
00472              cErr.Add( x, y+pos-1, 126, "Script-Script tag, invalid nesting" ); //HTML:ERR
00473          }
00474          levelScript = 1;
00475          hasSomeScript = true;
00476      }
00477      posEnd = sCmdUp.Find("</SCRIPT");
00478      if ( posEnd>pos ) {
00479          levelScript--;
00480          hasSomeScript = true;
00481      }
00482      if ( levelScript<0 ) {
00483          cErr.Add( x, y+posEnd-1, 127, "EndScript-EndScript tag, invalid nesting" ); //HTML:ERR
00484          levelScript = 0;
00485      }
00486      hasSomeScript = hasSomeScript==true || levelScript>0;
00487      ///printf("DBG:%s:%d:[lvl%d]: [%s]\n",hasSomeScript?"SCR":"nor",x,levelScript,str);
00488      if ( hasSomeScript ) continue;  // Avoid script checking
00489 
00490      if ( sCmd[1]!='<' ) continue;  // Not a command
00491 
00492      kLen = sCmd.Length();
00493      uChr = sCmdUp[2];
00494      doesEnd = uChr=='/';
00495      if ( ( (uChr>='A' && uChr<='Z') || uChr=='!' || uChr=='?' || doesEnd!=0 )==false ) {
00496          if ( kLen>2 )
00497              cErr.Add( x, y+1, 123, "Strictly not a tag." ); //HTML:ERR
00498          else
00499              cErr.Add( x, y+1, 133, "Strictly not a tag (empty)." ); //HTML:ERR
00500          notOkCount++;
00501      }
00502 
00503      if ( sCmd[ kLen ]!='>' ) {
00504          // Weird command, add it to 'cErr'
00505          cErr.Add( x, y, 116, "Unterminated tag." ); //HTML:ERR
00506          continue;
00507      }
00508 
00509      if ( notOkCount==0 ) {
00510          // Check remaining tag validity
00511          gString sCmdRem;
00512          sCmdRem.CopyFromTo( sCmdUp, 2+doesEnd, kLen );
00513          k = sCmdRem.FindExcept("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!?");  // <?xml> is a valid tag!
00514          ASSERTION(k>0,"k>0"); //At least will find '>'
00515          uChr = sCmdRem[k];
00516          k += (unsigned)doesEnd;
00517          if ( k+1<kLen ) {
00518              if ( uChr!=' ' && uChr!='=' ) {
00519                  cErr.Add( x, y+k, 124, "Not strictly valid character(s) in tag." ); //HTML:ERR
00520                  notOkCount++;
00521              }
00522              else {
00523                  if ( doesEnd!=0 ) {
00524                      cErr.Add( x, y+k, 125, "Not strictly valid character(s) in end-tag." ); //HTML:ERR
00525                      notOkCount++;
00526                  }
00527              }
00528          }
00529          // Only the first word was checked here, for simplicity.
00530      }
00531 
00532      for (k=2; k<=kLen && (uChr = sCmd[ k ])!=0; k++) {
00533          switch ( uChr ) {
00534          case '"':
00535              if ( quoting2==0 ) quoting = (quoting==0)*((short)k);
00536              break;
00537          case '\'':
00538              if ( quoting==0 ) quoting2 = (quoting2==0)*((short)k);
00539              break;
00540          default:
00541              if ( quoting!=0 || quoting2!=0 ) continue;
00542              // If there is an error, add-it to 'cErr'
00543              if ( uChr>=127 ) {
00544                  // Limited charset allowed for commands.
00545                  // 127..255 is non-7bit.
00546                  // To be fully strict, DTD compliance, tilde (126: '~') would not be allowed.
00547                  cErr.Add( x, y+k-1, 115, "Non 7bit symbol" ); //HTML:ERR
00548                  continue;
00549              }
00550              ; // Now check '<' cannot appear twice
00551              if ( uChr=='<' ) {
00552                  cErr.Add( x, y+k-1, 121, "Too many '<'" ); //HTML:ERR
00553                  continue;
00554              }
00555              if ( k<kLen && uChr=='>' ) {
00556                  cErr.Add( x, y+k-1, 122, "Too many '>'" ); //HTML:ERR
00557                  continue;
00558              }
00559              //
00560              // Now beautify commands (except between quotes)
00561              //
00562              switch ( tagCaseCh ) {
00563              case e_CaseUnchange:
00564                  break;
00565              case e_CaseConvUpper:
00566                  if ( uChr>='a' && uChr<='z' ) {
00567                      uChr -= 32;
00568                  }
00569                  str[ k-1 ] = uChr;
00570                  break;
00571              case e_CaseConvDown:
00572                  break; //Todo.
00573              default:
00574                  break;
00575              }//end CASE (tagCaseCh)
00576              break;
00577          }//end CASE (uChr)
00578      }//end FOR k
00579      if ( quoting!=0 ) {
00580          cErr.Add( x, y+quoting-1, 117, "Unterminated double-quote" ); //HTML:ERR
00581      }
00582      if ( quoting2!=0 ) {
00583          cErr.Add( x, y+quoting-1, 118, "Unterminated single-quote" ); //HTML:ERR
00584      }
00585  }//end FOR i
00586 
00587  // Update nErrorsXXX data-members
00588  n = cErr.N();
00589  nHtmlErrors += n;
00590  if ( n>0 ) {
00591      firstHtmlErrLNr = cErr.GetCoord( 1 ).GetX();
00592      lastHtmlErrLNr = cErr.GetCoord( n ).GetX();
00593  }
00594  return 0;
00595 }
00596 
00597 int gUnweb::thisCutCommentsInTxt (sCoordText& coordCmt, sCoordText& zRes)
00598 {
00599  unsigned i, nPost = zRes.N();
00600  unsigned idx;
00601  int iLen;
00602  int x, y, z;
00603  int y1;
00604  char* str;
00605 
00606  if ( coordCmt.N()==0 ) return 0;  // Nothing to do
00607 
00608  for (i=1; i<=nPost; i++) {
00609      z = zRes.GetCoord( i ).GetZ();
00610      if ( z!=0 ) continue; // Not a command/tag, ignore
00611      gList foundL, foundTxtL;
00612      x = zRes.GetCoord( i ).GetX();
00613      idx = coordCmt.FindCoordX( x, foundL, foundTxtL );
00614      if ( idx==0 ) continue; // No comment in line x
00615      y = zRes.GetCoord( i ).GetY();
00616      unsigned ilIdx, ilN;
00617      int strIter, cmtIter, cmtIterMax;
00618      str = zRes.Str( i );
00619      iLen = strlen(str);
00620      for (ilIdx=1, ilN=foundL.N(); ilIdx<=ilN; ilIdx++) {
00621          gIntCoord* pCoord;
00622          pCoord = (gIntCoord*)foundL.GetObjectPtr( ilIdx );
00623          y1 = pCoord->GetY();
00624          if ( y1 < y+iLen ) {
00625              cmtIterMax = strlen( foundTxtL.Str( ilIdx ) );
00626              strIter = y1-y;
00627              if ( strIter<0 ) continue;  // Comment at begining of line, already suppressed.
00628              for (cmtIter=0; cmtIter<cmtIterMax; cmtIter++) {
00629                  ASSERTION(strIter<iLen,"strIter...");
00630                  str[ strIter++ ] = cCHR_HTM_PARSED;;
00631                  //fprintf(stderr,"DBG:STR=%s|y=%d,y1=%d (ilIdx=%d, strIter=%d)\n",str,y,y1,ilIdx,strIter);
00632              }
00633          }
00634      }//end FOR ilIdx
00635      gString sTemp;
00636      for (strIter=0; strIter<iLen; strIter++) {
00637          t_uchar uChr( (t_uchar)str[ strIter ] );
00638          if ( uChr!=cCHR_HTM_PARSED ) sTemp.Add( uChr );
00639      }
00640      strcpy( str, sTemp.Str() );
00641  }//end FOR i (line loop)
00642  return 0;
00643 }
00644 
00645 int gUnweb::thisPreProcess (FILE* fRepErr, gList& iL, gList& oL)
00646 {
00647  ;
00648  // Pre-processes, if not already
00649  // Returns -1 if already processed;
00650  //         0 if all ok
00651  //         >0 for an error parsing HTML
00652  //
00653  int error;
00654  int len;
00655  unsigned i, n;
00656  char* str;
00657 
00658  if ( HasProcessed()==false ) return -1;
00659 
00660  // cCHR_HTM_PARSED is never changed.
00661  firstHtmlError = firstHtmlErrLNr = 0;
00662  lastHtmlError = lastHtmlErrLNr = 0;
00663  nHtmlErrors = 0;
00664 
00665  // Clean-up output lst
00666  oL.Delete();
00667 
00668  // Build stripL the first time
00669  error = 0;
00670  for (i=1, n=iL.N(); i<=n; i++) {
00671      str = iL.Str( i );
00672      // Check tailing '\r', and cut it!
00673      while ( (len = strlen(str))>0 && str[--len]=='\r' ) str[len] = 0;
00674      // Check if used HTM_PARSED character will collide with existing input
00675      if ( gStrControl::Self().Find( str, cCHR_HTM_PARSED )>0 ) {
00676          fprintf(fRepErr,"Found chr 0x%02X on line: %u\n",cCHR_HTM_PARSED,i);
00677          error++;
00678      }
00679      if ( gStrControl::Self().Find( str, '\r' )>0 ) {
00680          fprintf(fRepErr,"Found chr 0x%02X on line: %u\n",'\r',i);
00681          error++;
00682      }
00683      gString sTrim( str );
00684      sTrim.TrimRight();
00685      stripL.Add( sTrim );
00686  }
00687 
00688  if ( error!=0 ) return 1;
00689 
00690  if ( isVerbose ) fprintf(VerboseStream(),"gUnweb:PreJoin (%u lines)\n",n);
00691 
00692  error = thisPreJoin( fRepErr, iL, stripL, oL );
00693  if ( error!=0 ) return error;
00694 
00695  // Just checking iL cardinality is the same as stripL
00696  ASSERTION(iL.N()==stripL.N(),"iL.N()==stripL.N()");
00697 
00698  // Now we want to knock-out comments,
00699  // and document-it at 'coordComments'
00700  error = thisPreStripComments( fRepErr, stripL, coordComments );
00701  if ( error!=0 ) return error;
00702 
00703  if ( isVerbose ) fprintf(VerboseStream(),"gUnweb:StripComments (%u lines)\n",coordComments.N());
00704 
00705  // So far the hardest parsing part is done.
00706 
00707  // Now consolidate data...
00708  thisConsolidateData( stripL, 1, n, 1 );
00709 
00710  if ( isVerbose ) fprintf(VerboseStream(),"gUnweb:Post\n");
00711 
00712  return error;
00713 }
00714 
00715 int gUnweb::thisPreJoin (FILE* fRepErr, gList& iL, gList& sL, gList& oL)
00716 {
00717  unsigned i, n;
00718  unsigned uLen, pos, pos2;
00719  unsigned nOcc = 0;
00720  char* stripStr;
00721 
00722  // Step 1:
00723  // * Find '<!DOCTYPE', i.e. '<!' and join-in
00724 
00725  for (i=1, n=iL.N(); i<=n; i++) {
00726      stripStr = sL.Str( i );
00727      gString s( stripStr );
00728      uLen = s.Length();
00729 
00730      while ( (pos = s.Find("<!DOCTYPE ",nOcc,false))>0 ) {
00731          // Pass begining of string (if any chrs before '<!') to oL: thisStrMove( s, 1, pos-1, oL )
00732 
00733          // Find '>' on this line, or in the remaining lines
00734          gString sEnd;
00735          sEnd.CopyFromTo( s, pos, uLen );
00736          pos2 = sEnd.Find( ">" );
00737          if ( pos2>0 ) {
00738              pos2 += pos - 1;
00739              // Explanation follows for the first usage of 'thisStrMove' method:
00740              // - Found XXX<!DOCTYPE...YYY>*
00741              // - Replace it by XXX@@@@@*
00742              //   where @ here referred represents CHR_HTM_PARSED (inverted excl.mark)
00743              thisStrMove( s, pos, pos2, oL );
00744              // Note: 'oL' has just has the parsed '<!DOCTYPE...>' instructions.
00745 
00746              // Next sequence:
00747              // 'thisStripReplace' copies the current changed string (with @s)
00748              // to the stripStr memory location (no fancy copy here!)
00749              // Basically this allows to keep tracking of the characters
00750              // already parsed from the original text, in the exact (line,col) locations.
00751              thisStripReplace( s, stripStr );
00752              continue;
00753          }
00754          // line extends beyond: <!xxx ... \n ...> ?
00755          // => '<!DOCTYPE' beyond newline
00756          //
00757          // First step is to change the current line XXX<!DOCTYPE...YYY
00758          // to:  XXX@@@
00759          gString sCat, sLimp;
00760          thisStrSetParse( s, pos, uLen, sCat );
00761          thisStripReplace( s, stripStr );
00762          for (i=i+1, pos=0; i<=n && pos==0; i++) {
00763              stripStr = sL.Str( i );
00764              s = stripStr;
00765              uLen = s.Length();
00766              if ( uLen==0 ) continue;
00767              sCat.Add( " " );
00768              pos = s.Find( ">" );
00769              if ( pos==0 ) {
00770                  gString sTrim( s );
00771                  sTrim.Trim();
00772                  sCat.AddString( sTrim );
00773                  thisStrSetParse( s, pos, uLen, sLimp );
00774                  thisStripReplace( s, stripStr );
00775              }
00776          }//end FOR inner-i
00777          if ( pos==0 ) return 101;  // HTML:ERR: DocType not ended
00778          // ...YYY>ZZZ =>sLimp-becomes=> YYY>
00779          thisStrSetParse( s, 1, pos, sLimp );
00780          // ...now stripStr becomes: @@@@ZZZ
00781          thisStripReplace( s, stripStr );
00782          // ...and sCat must contain YYY>, e.g.: <!DOCTYPE AAA\nBBB\nYYY>
00783          sCat.AddString( sLimp );
00784          // Finally adding the command to 'oL'
00785          thisStrMove( sCat, 1, sCat.Length(), oL );
00786      }
00787  }//end FOR (i)
00788 
00789  return 0;
00790 }
00791 
00792 int gUnweb::thisPreStripComments (FILE* fRepErr, gList& iL, sCoordText& zRes)
00793 {
00794  unsigned i, n;
00795  unsigned k;
00796  unsigned uLen, pos, pos2;
00797  char* stripStr;
00798 
00799  // Note: rewrites 'iL' strings
00800 
00801  for (i=1, n=iL.N(); i<=n; ) {
00802      stripStr = iL.Str( i );
00803      uLen = (unsigned)strlen( stripStr );
00804      pos = gStrControl::Self().Find( stripStr, "<!--" );
00805      if ( pos==0 ) {
00806          i++;
00807          continue;
00808      }
00809      zRes.AddCoord( i, pos );
00810      // 'sCom' will get bigger for the comments in, including '\n'
00811      gString sCom;
00812      // 0-based index for stripStr, thus decrement posXXX
00813      pos--;
00814      // One or more occurrences of a comment here.
00815      // The comment may end in this line or below.
00816      pos2 = gStrControl::Self().Find( stripStr+pos, "-->" );
00817      if ( pos2>0 ) {
00818          pos2--;
00819          pos2 += pos;
00820          // The comment ends at this line
00821          for (k=pos; k<pos2+3; k++) {
00822              // ...why pos2+3? Because '-->' has 3 chars.
00823              sCom.Add( stripStr[k] );
00824              stripStr[k] = cCHR_HTM_PARSED;
00825          }
00826          zRes.AddText( sCom );
00827          // ...and try again the same line
00828          continue;
00829      }
00830      // Here we know there was a '<!--',
00831      // ...knock-out comments here
00832      for (k=pos; k<uLen; k++) {
00833          sCom.Add( stripStr[k] );
00834          stripStr[k] = cCHR_HTM_PARSED;
00835      }
00836      sCom.Add( '\n' );
00837      // ...and find in the lines below: '-->'
00838      i++;
00839      for (pos2=0; i<=n && pos2==0; i++) {
00840          stripStr = iL.Str( i );
00841          uLen = (unsigned)strlen( stripStr );
00842          pos2 = gStrControl::Self().Find( stripStr, "-->" );
00843          if ( pos2==0 ) {
00844              sCom.Add( stripStr );
00845              sCom.Add( '\n' );
00846              for (k=0; k<uLen; k++) stripStr[k] = cCHR_HTM_PARSED;
00847              continue;
00848          }
00849          for (k=0; k+1<pos2+3; k++) {
00850              // ...why pos2+3? Because '-->' has 3 chars.
00851              sCom.Add( stripStr[k] );
00852              stripStr[k] = cCHR_HTM_PARSED;
00853          }
00854      }// end FOR inner-i...
00855      zRes.AddText( sCom );
00856      if ( pos2==0 ) return 102;  // HTML:ERR: Comment did not end
00857  }
00858  return 0;
00859 }
00860 
00861 int gUnweb::thisStrMove (gString& s, unsigned startPos, unsigned endPos, gList& oL)
00862 {
00863  unsigned x;
00864  int count=0;
00865  gString sOut;
00866  bool anyScript=false;
00867 
00868  sOut.CopyFromTo( s, startPos, endPos );
00869  sOut.Trim();
00870  gString sUp( sOut );
00871  sUp.UpString();
00872 
00873  unsigned pos = sUp.Find("<SCRIPT");
00874  if ( (anyScript = (pos>0))==true ) {
00875      scriptLevel++;
00876  }
00877  else {
00878      pos = sUp.Find("</SCRIPT");
00879      scriptLevel -= pos>0;
00880      if ( scriptLevel<0 ) scriptLevel = 0;  // No error report, here.
00881      anyScript = scriptLevel>0;
00882  }
00883 
00884  //printf("DBG:IN_:'%s' (scriptLevel=%d)\n",sOut.Str(),scriptLevel);
00885 
00886  // Fill-up string 's' with HTM_PARSED chars
00887  for (x=startPos; x<=endPos; x++, count++) s[x] = cCHR_HTM_PARSED;
00888 
00889  //if ( anyScript ) printf("DBG:OUT:'%s'\n",sOut.Str());
00890  //printf("DBG:OUT:'%s'\nP:%s\n\n",s.Str(),sOut.Str());
00891 
00892  if ( sOut.Length()==0 ) return 0;
00893  oL.Add( sOut );
00894 
00895  return count;
00896 }
00897 
00898 int gUnweb::thisStripReplace (gString& s, char* resStr)
00899 {
00900  unsigned uLen = s.Length();
00901  unsigned rLen;
00902 
00903  if ( resStr==nil ) return -1;
00904  rLen = (unsigned)strlen( resStr );
00905  ASSERTION(uLen==rLen,"uLen==rLen");
00906  strcpy( resStr, s.Str() );
00907  return 0;
00908 }
00909 
00910 int gUnweb::thisStrSetParse (gString& s, unsigned startPos, unsigned endPos, gString& sResult)
00911 {
00912  gList tempL;
00913  ASSERTION(endPos>0,"endPos>0");
00914  if ( thisStrMove( s, startPos, endPos, tempL )==0 ) return 0;
00915  sResult = tempL.Str( 1 );
00916  return 1;
00917 }
00918 
00919 ////////////////////////////////////////////////////////////
00920 // continuing... gUnweb post-processing
00921 ////////////////////////////////////////////////////////////
// Post-processing pass of gUnweb over the collected lines.
//
// What this body does:
//   1. Every entry of 'headL' is appended to 'oL' and registered in 'coord'
//      through thisAddCmd() at position (1,1).
//   2. Each line of 'iL' (1-based index 'i') is scanned left to right from
//      the 1-based column 'posStart'. Text runs go to thisAddTxt(), "<...>"
//      keywords go to thisAddCmd(); the consumed span is handed to
//      thisStrMove() together with 'oL', and thisStripReplace() is called on
//      the line afterwards (per the original comment, 'iL' is rewritten).
//   3. A keyword whose closing '>' is not on its own line is assembled from
//      the following lines into one command string.
//
// Parameters:
//   fRepErr - not referenced anywhere in this body.
//   headL   - head lines, copied to 'oL' first.
//   iL      - input lines being parsed.
//   oL      - output list (head lines, moved spans, multi-line commands).
//   coord   - receives the text/command entries; asserted empty on entry.
//
// Returns 0 when the whole of 'iL' was processed, 111 when a keyword opened
// by '<' finds no closing '>' before the end of 'iL'.
00922 int gUnweb::thisPostProcess (FILE* fRepErr, gList& headL, gList& iL, gList& oL, sCoordText& coord)
00923 {
00924  unsigned i, n;
00925  unsigned uLen, pos, pos2;
00926  unsigned posStart;
00927  char* stripStr;
00928  gString sLimp;
00929 
00930  ASSERTION(coord.N()==0,"coord.N()==0");
00931  scriptLevel = 0;  // Nesting level of <SCRIPT...> keywords.
       // NOTE(review): this body only reads scriptLevel after the reset above;
       // presumably it is updated by a callee (coord.Add?) -- confirm.
00932 
00933  // Fancy adjust of output
00934  for (i=1, n=headL.N(); i<=n; i++) {
00935      oL.Add( stripStr = headL.Str( i ) );
00936      thisAddCmd( 1, 1, stripStr, coord );
00937  }
00938  // .
00939 
00940  // Serialize keywords and text found (into 'coord'; iL rewritten)
00941 
       // No increment clause: 'i' only advances when a line is exhausted, so
       // one line may be visited several times with a growing 'posStart'.
00942  for (i=1, n=iL.N(), posStart=1; i<=n; ) {
00943      stripStr = iL.Str( i );
00944      gString s( stripStr );
00945      // stripStr is already trimmed
00946      uLen = s.Length();
00947      if ( posStart>uLen ) {
00948          i++; posStart = 1; // Parse next line
00949          continue;
00950      }
00951 
00952      ASSERTION(posStart>0,"posStart>0");
           // Look for the next '<' at or after posStart; 0 means none found.
00953      pos = FindStringKey( s, "<", posStart, uLen );
00954      if ( pos==0 ) {
               // No keyword left: the rest of the line is plain text.
00955          thisAddTxt( i, posStart, stripStr+posStart-1, coord );
00956          thisStrMove( s, posStart, uLen, oL );
00957          thisStripReplace( s, stripStr );
00958          i++; posStart = 1;  // Parse next line
00959          continue;
00960      }
00961 
00962      ASSERTION(pos>=posStart,"pos>=posStart");
00963 
00964      // Strip before keyword, e.g.: XXX<BODY>YYY becomes @@@<BODY>YYY
00965      if ( pos>posStart ) {
00966          gString sTemp;
00967          unsigned aKeepPos;
00968          if ( scriptLevel>0 ) {
                   // NOTE(review): strlen() yields size_t but is printed with
                   // "%d" below; only matters if DBGPRINT_WEB expands to a
                   // printf-like call -- confirm.
00969              DBGPRINT_WEB("WEB: Post: XXX<tag>YYY [%s] scriptLevel=%d pos=%u,posStart=%u,len=%d\n",stripStr,scriptLevel,pos,posStart,strlen(stripStr));
00970              // Check if something like
00971              // "@@@<SCRIPT>"+"xyz"+!"</SCRIPT>"
00972              char* strTagEtc = stripStr+pos-1;
                   // A result of 1 means "</SCRIPT" sits right at the '<' found above.
00973              if ( gStrControl::Self().Find( strTagEtc, "</SCRIPT", true )==1 ) {
                       // NOTE(review): if Find() is 1-based, pos+Find(...) is one
                       // column past the '>' -- confirm the intended end column.
00974                  sTemp.CopyFromTo( s, posStart, pos+gStrControl::Self().Find( strTagEtc, ">" ) );
00975                  DBGPRINT_WEB("WEB: Post: scriptLevel_now=%d [strTagEtc=%s] [%s]\n",scriptLevel,strTagEtc,sTemp.Str());
00976                  pos--;
00977                  thisAddTxt( i, posStart, sTemp, coord );
00978                  thisStrMove( s, posStart, pos, oL );
00979                  thisStripReplace( s, stripStr );
00980                  DBGPRINT_WEB("WEB: Post: after1 <tag>YYY [%s] scriptLevel=%d pos=%u,posStart=%u\n",stripStr,scriptLevel,pos,posStart);
                       // pos was decremented above, so this resumes at the '<'.
00981                  posStart = pos+1;
00982                  continue;
00983              }
                   // Inside a script and the '<' is not a closing </SCRIPT:
                   // the remainder of this line is skipped.
00984              i++; posStart = 1;  // SCR_IGN
00985              continue;
00986          }
               // Not inside a script: register the text in front of the keyword.
00987          aKeepPos = pos;
00988          pos--;
00989          sTemp.CopyFromTo( s, posStart, pos );
00990          thisAddTxt( i, posStart, sTemp, coord );
00991          thisStrMove( s, posStart, pos, oL );
00992          thisStripReplace( s, stripStr );
00993          posStart = aKeepPos;  // Keep seeking, in the above e.g.: <BODY>YYY
00994          continue;
00995      }
00996 
00997      //
00998      // At least one keyword: either all in this line or below, e.g.:
00999      //    <BODY bgcolor="#FFEECC">xyz</HEAD>
01000      // or <BODY bgcolor="#FFE>ECC">xyz</HEAD> (misleading '<', belongs to string)
01001      //
           // Here pos==posStart, i.e. the line (from posStart) begins with '<'.
01002      pos2 = FindStringKey( s, ">", pos+1, uLen );
01003      DBGPRINT("FIND(%s):posStart=%d,uLen=%d:POS=%d, POS2=%d\n",stripStr,posStart,uLen,pos,pos2);
01004      if ( pos2>0 ) {
               // Keyword opens and closes on this line: columns pos..pos2.
01005          gString sTemp;
01006          ASSERTION(pos2>pos,"pos2>pos");
01007          sTemp.CopyFromTo( s, pos, pos2 );
01008          if ( gStrControl::Self().Find( sTemp.Str(), "</SCRIPT", true ) ||
01009               scriptLevel>0 ) {
                   // Script-related keyword: same as the branch below except
                   // that thisStripReplace() is NOT called here.
01010              DBGPRINT_WEB("DBG:/SCRIPT:sTemp=%s!\n",sTemp.Str());
01011              thisAddCmd( i, pos, sTemp, coord );  // SCR_IGN
01012              thisStrMove( s, pos, pos2, oL );
01013          }
01014          else {
01015              thisAddCmd( i, pos, sTemp, coord );
01016              thisStrMove( s, pos, pos2, oL );
01017              thisStripReplace( s, stripStr );
01018          }
01019          posStart = pos2+1;
01020          continue;
01021      }
01022 
01023      // Something like e.g.: <BODY ... (no ending '>')
01024      ASSERTION(pos2==0,"pos2==0");
01025      gString sCmd;
01026      sCmd.CopyFromTo( s, pos, uLen );
01027 
01028      DBGPRINT("DBG: sCmd=[%s]\n",sCmd.Str());
01029 
           // Remember the line on which the keyword started; it is the line
           // reported to thisAddCmd() further down.
01030      int iKeep = i;
01031      // Find in the lines below the trailing '>'
01032      i++; posStart = 1;  // ...next line
01033      for (pos2=0; i<=n && pos2==0; ) {
               // NOTE(review): 'stripStr' and 's' declared here shadow the outer
               // ones; the outer pair still refers to line iKeep after the loop.
01034          char* stripStr = iL.Str( i );
01035          pos2 = FindStringKey( stripStr, ">" );
01036          gString s( stripStr );
01037          s.Trim();
01038          if ( s.IsEmpty() ) {
01039              i++;
01040              continue;
01041          }
01042          if ( pos2==0 ) {
                   // Continuation line without '>': append it (trimmed) to the
                   // command, separated by one blank, and move on.
01043              sCmd.Add( ' ' );
01044              sCmd.AddString( s );
01045              s.Set( stripStr );
01046              thisStrSetParse( s, 1, s.Length(), sLimp );
01047              thisStripReplace( s, stripStr );
01048              i++;
01049          }
               // When pos2>0 nothing is done here: the loop condition ends the
               // search with 'i' still on the line that holds the '>'.
01050      }
01051      if ( pos2==0 ) return 111;  // ...command did not end (till EOF)
01052 
01053      // Update stripStr, not yet striked
           // (outer 's'/'stripStr': the line where the keyword started)
01054      thisStrSetParse( s, pos, uLen, sLimp );
01055      thisStripReplace( s, stripStr );
01056      // Update sCmd, which has not the latter line 'i'
01057      s.Set( stripStr = iL.Str( i ) );
01058      sLimp.CopyFromTo( s, 1, pos2 );
01059      sLimp.Trim();
01060      // ...just add a blank if necessary...
01061      if ( FindStringKey( sLimp, ">" )>1 ) sCmd.Add(' ');
01062      sCmd.AddString( sLimp );
01063      // Add command to coord!
01064      thisAddCmd( iKeep, pos, sCmd, coord );
01065      oL.Add( sCmd );
01066      // ...and strike chars before end of command, i.e. before '>'
01067      thisStrSetParse( s, 1, pos2, sLimp );
01068      thisStripReplace( s, stripStr );
01069      // Update indexing vars, and keep parsing the current line
01070      posStart = pos2+1;
01071  }
01072  return 0;
01073 }
01074 
01075 int gUnweb::thisAddTxt (int x, int y, char* s, sCoordText& coord)
01076 {
01077  if ( s==nil ) return -1;
01078  gString aS( s );
01079  return thisAddTxt( x, y, aS, coord );
01080 }
01081 
01082 int gUnweb::thisAddTxt (int x, int y, gString& sTxt, sCoordText& coord)
01083 {
01084  // Return 0 if o.k.
01085  bool isOk;
01086  int iPos = (int)sTxt.FindExcept( " \t" ), iDelta;
01087 
01088  iDelta = ( iPos>0 ? iPos-1 : 0 );
01089  y += iDelta;
01090 
01091  if ( scriptLevel>0 && (showKind & e_ShowScript)==0 ) return 0;
01092  if ( scriptLevel<=0 && (showKind & e_ShowNonscript)==0 ) return 0;
01093 
01094  isOk = coord.Add( x, y, sTxt );
01095  if ( isOk==false ) return -1;
01096  isOk = thisTxtCheck( x, y, sTxt, coordParseErr )==0;
01097  if ( isOk==false ) return -1;
01098  // When using <PRE>, the exact string location must be held
01099  unsigned n = coord.N();
01100  coord.GetCoord(n).iAux = iDelta;
01101  // .
01102  return 0;
01103 }
01104 
01105 int gUnweb::thisAddCmd (int x, int y, char* s, sCoordText& coord)
01106 {
01107  if ( s==nil ) return -1;
01108  gString aS( s );
01109  return thisAddCmd( x, y, aS, coord );
01110 }
01111 
01112 int gUnweb::thisAddCmd (int x, int y, gString& sCmd, sCoordText& coord)
01113 {
01114  int z=-1;
01115  coord.Add( x, y, z, sCmd );
01116  //printf("DBG:CMD:'%s'\n",sCmd.Str());
01117  return z;
01118 }
01119 
01120 int gUnweb::thisTxtCheck (int x, int y, gString& s, sCoordText& cErr)
01121 {
01122  // Return 0 if o.k.
01123  unsigned pos, posCount = 0;
01124 
01125  if ( scriptLevel>0 ) return 0;  // Using scripts, no checking performed
01126  pos = FindStringKey( s, "<" );
01127  posCount += pos;
01128  if ( pos>0 ) {
01129      // At least one invalid symbol found
01130      cErr.Add( x, y+pos-1, 105, "Invalid symbol: '<'" ); //HTML:ERR
01131  }
01132  pos = FindStringKey( s, ">" );
01133  posCount += pos;
01134  if ( pos>0 ) {
01135      // At least one invalid symbol found
01136      cErr.Add( x, y+pos-1, 106, "Invalid symbol: '>'" ); //HTML:ERR
01137  }
01138  return posCount==0 ? 0 : 1;
01139 }
01140 ////////////////////////////////////////////////////////////
01141 

Generated on Sat Aug 18 02:40:59 2007 for xpfweb_v2x lib by  doxygen 1.4.2