diff --git a/include/lgi/common/DocView.h b/include/lgi/common/DocView.h --- a/include/lgi/common/DocView.h +++ b/include/lgi/common/DocView.h @@ -1,524 +1,524 @@ /// \file /// \author Matthew Allen (fret@memecode.com) /// \brief This is the base data and code for all the text controls (inc. HTML) #pragma once #include #include "lgi/common/Variant.h" #include "lgi/common/Notifications.h" #include "lgi/common/Thread.h" #include "lgi/common/Layout.h" // Word wrap enum LDocWrapType { /// No word wrapping TEXTED_WRAP_NONE = 0, /// Dynamically wrap line to editor width TEXTED_WRAP_REFLOW = 1, }; // Util macros /// Returns true if 'c' is whitespace #define IsWhiteSpace(c) ((c) < 126 && strchr(LDocView::WhiteSpace, c) != 0) /// Returns true if 'c' is a delimiter #define IsDelimiter(c) ((c) < 126 && strchr(LDocView::Delimiters, c) != 0) /// Returns true if 'c' is a letter or number #define IsText(c) (IsDigit(c) || IsAlpha(c) || (c) == '_') /// Returns true if 'c' is word boundry #define IsWordBoundry(c) (strchr(LDocView::WhiteSpace, c) || strchr(LDocView::Delimiters, c)) /// Returns true if 'c' is alphanumeric or a digit #define AlphaOrDigit(c) (IsDigit(c) || IsAlpha(c)) /// Returns true if 'c' is a valid URL character #define UrlChar(c) ( \ strchr(LDocView::UrlDelim, (c)) || \ AlphaOrDigit((c)) || \ ((c) >= 256) \ ) /// Returns true if 'c' is email address character #define EmailChar(c) (strchr("._-:+", (c)) || AlphaOrDigit((c))) LgiFunc char16 *ConvertToCrLf(char16 *Text); /// This class contains information about a link. /// \sa LDetectLinks struct LLinkInfo { ssize_t Start; ssize_t Len; bool Email; void Set(ssize_t start, ssize_t len, bool email) { Start = start; Len = len; Email = email; } }; // Call back class to handle viewer events class LDocView; /// An environment class to handle requests from the text view to the outside world. class LgiClass LDocumentEnv : public LThreadOwner { LArray Viewers; public: LDocumentEnv(LDocView *v = 0); virtual ~LDocumentEnv(); enum LoadType { LoadError, LoadNotImpl, LoadImmediate, LoadDeferred, }; struct #ifdef MAC LgiClass #endif LoadJob : public LThreadJob { enum PrefFormat { FmtNone, FmtStream, FmtSurface, FmtFilename, }; enum JobStatus { JobInit, JobOk, JobErr_Uri, JobErr_Path, JobErr_FileOpen, JobErr_GetUri, JobErr_NoCachedFile, JobErr_ImageFilter, JobErr_NoMem, }; // View data LDocumentEnv *Env; void *UserData; uint32_t UserUid; PrefFormat Pref; // Input data LAutoString Uri; LAutoString PostData; // Output data LAutoPtr Stream; LAutoPtr pDC; LString Filename; LString Error; JobStatus Status; LString MimeType, ContentId; LoadJob(LThreadTarget *o) : LThreadJob(o) { Env = NULL; UserUid = 0; UserData = NULL; Pref = FmtNone; Status = JobInit; } LStreamI *GetStream() { if (!Stream && Filename) { LFile *file = new LFile; if (file && file->Open(Filename, O_READ)) Stream.Reset(file); else DeleteObj(file); } return Stream; } }; LoadJob *NewJob() { return new LoadJob(this); } bool AttachView(LDocView *v) { if (!v) return false; if (!Lock(_FL)) return false; LAssert(!Viewers.HasItem(v)); Viewers.Add(v); Unlock(); return true; } bool DetachView(LDocView *v) { if (!v) return false; if (!Lock(_FL)) return false; LAssert(Viewers.HasItem(v)); Viewers.Delete(v); Unlock(); return true; } int NextUid(); /// Creating a context menu, usually when the user right clicks on the /// document. virtual bool AppendItems(LSubMenu *Menu, const char *Param, int Base = 1000) { return false; } /// Do something when the menu items created by LDocumentEnv::AppendItems /// are clicked. virtual bool OnMenu(LDocView *View, int Id, void *Context) { return false; } /// Asks the env to get some data linked from the document, e.g. a css file or an iframe source etc. /// If the GetContent implementation takes ownership of the job pointer then it should set 'j' to NULL. virtual LoadType GetContent(LoadJob *&j) { return LoadNotImpl; } /// After the env's thread loads the resource it calls this to pass it to the doc void OnDone(LAutoPtr j); /// Handle a click on URI virtual bool OnNavigate(LDocView *Parent, const char *Uri) { return false; } /// Handle a form post virtual bool OnPostForm(LDocView *Parent, const char *Uri, const char *Data) { return false; } /// Process dynamic content, returning a dynamically allocated string /// for the result of the executed script. Dynamic content is enclosed /// between <? and ?>. - virtual char *OnDynamicContent(LDocView *Parent, const char *Code) { return 0; } + virtual LString OnDynamicContent(LDocView *Parent, const char *Code) { return NULL; } /// Some script was received, the owner should compile it virtual bool OnCompileScript(LDocView *Parent, char *Script, const char *Language, const char *MimeType) { return false; } /// Some script needs to be executed, the owner should compile it virtual bool OnExecuteScript(LDocView *Parent, char *Script) { return false; } }; /// Default text view environment /// /// This class defines the default behavior of the environment, /// However you will need to instantiate this yourself and call /// SetEnv with your instance. i.e. it's not automatic. class LgiClass LDefaultDocumentEnv : public LDocumentEnv { public: LoadType GetContent(LoadJob *&j); bool OnNavigate(LDocView *Parent, const char *Uri); }; /// Find params class LDocFindReplaceParams { public: virtual ~LDocFindReplaceParams() {} }; /// TextView class is a base for all text controls class LgiClass LDocView : public LLayout, virtual public LDom { friend class LDocumentEnv; protected: LDocumentEnv *Environment = NULL; LString Charset; public: // Static static const char *WhiteSpace; static const char *Delimiters; static const char *UrlDelim; /////////////////////////////////////////////////////////////////////// // Properties #define _TvMenuProp(Type, Name, Default) \ protected: \ Type Name = Default; \ public: \ virtual void Set##Name(Type i) { Name=i; } \ Type Get##Name() { return Name; } _TvMenuProp(uint16, WrapAtCol, 0) _TvMenuProp(bool, UrlDetect, true) _TvMenuProp(bool, ReadOnly, false) _TvMenuProp(LDocWrapType, WrapType, TEXTED_WRAP_REFLOW) _TvMenuProp(uint8_t, TabSize, 4) _TvMenuProp(uint8_t, IndentSize, 4) _TvMenuProp(bool, HardTabs, true) _TvMenuProp(bool, ShowWhiteSpace, false) _TvMenuProp(bool, ObscurePassword, false) _TvMenuProp(bool, CrLf, false) _TvMenuProp(bool, AutoIndent, true) _TvMenuProp(bool, FixedWidthFont, false) _TvMenuProp(bool, LoadImages, false) _TvMenuProp(bool, OverideDocCharset, false) // This UID is used to match data load events with their source document. // Sometimes data will arrive after the document that asked for it has // already been unloaded. So by assigned each document an UID we can check // the job UID against it and discard old data. _TvMenuProp(int, DocumentUid, 0) #undef _TvMenuProp virtual const char *GetCharset() { return Charset.Get() ? Charset.Get() : "utf-8"; } virtual void SetCharset(const char *s) { Charset = s; } virtual const char *GetMimeType() = 0; /////////////////////////////////////////////////////////////////////// // Object LDocView(LDocumentEnv *e = NULL) { SetEnv(e); } virtual ~LDocView() { SetEnv(NULL); } const char *GetClass() { return "LDocView"; } /// Open a file handler virtual bool Open(const char *Name, const char *Cs = 0) { return false; } /// Save a file handler virtual bool Save(const char *Name, const char *Cs = 0) { return false; } /////////////////////////////////////////////////////////////////////// /// Find window handler virtual void DoFind(std::function Callback) { if (Callback) Callback(false); } /// Replace window handler virtual void DoReplace(std::function Callback) { if (Callback) Callback(false); } virtual LDocFindReplaceParams *CreateFindReplaceParams() { return 0; } virtual void SetFindReplaceParams(LDocFindReplaceParams *Params) { } /////////////////////////////////////////////////////////////////////// /// Get the current environment virtual LDocumentEnv *GetEnv() { return Environment; } /// Set the current environment virtual void SetEnv(LDocumentEnv *e) { if (Environment) Environment->DetachView(this); Environment = e; if (Environment) Environment->AttachView(this); } /// When the env has loaded a resource it can pass it to the doc control via this method. /// It MUST be thread safe. Often an environment will call this function directly from /// it's worker thread. virtual void OnContent(LDocumentEnv::LoadJob *Res) {} /////////////////////////////////////////////////////////////////////// // State / Selection /// Set the cursor position, to select an area, move the cursor with Select=false /// then set the other end of the region with Select=true. virtual void SetCaret(size_t i, bool Select, bool ForceFullUpdate = false) {} /// Cursor=false means the other end of the selection if any. The cursor is alwasy /// at one end of the selection. virtual ssize_t GetCaret(bool Cursor = true) { return 0; } /// True if there is a selection virtual bool HasSelection() { return false; } /// Unselect all the text virtual void UnSelectAll() {} /// Select the word from index 'From' virtual void SelectWord(size_t From) {} /// Select all the text in the control virtual void SelectAll() {} /// Get the selection as a dynamicially allocated utf-8 string virtual char *GetSelection() { return 0; } /// Returns the character index at the x,y location virtual ssize_t IndexAt(int x, int y) { return 0; } /// Index=-1 returns the x,y of the cursor, Index >=0 returns the specified x,y virtual bool GetLineColumnAtIndex(LPoint &Pt, ssize_t Index = -1) { return false; } /// True if the document has changed virtual bool IsDirty() { return false; } /// Gets the number of lines of text virtual size_t GetLines() { return 0; } /// Gets the pixels required to display all the text virtual void GetTextExtent(int &x, int &y) {} /////////////////////////////////////////////////////////////////////// /// Cuts the selection from the document and puts it on the clipboard virtual bool Cut() { return false; } /// Copies the selection from the document to the clipboard virtual bool Copy() { return false; } /// Pastes the current contents of the clipboard into the document virtual bool Paste() { return false; } /////////////////////////////////////////////////////////////////////// /// Called when the user hits the escape key virtual void OnEscape(LKey &K) {} /// Called when the user hits the enter key virtual void OnEnter(LKey &k) {} /// Called when the user clicks a URL virtual void OnUrl(char *Url) {} /// Called to add styling to the document virtual void OnAddStyle(const char *MimeType, const char *Styles) {} /////////////////////////////////////////////////////////////////////// struct ContentMedia { LString Id; LString FileName; LString MimeType; LVariant Data; LAutoPtr Stream; bool Valid() { return MimeType.Get() != NULL && FileName.Get() != NULL && ( (Data.Type == GV_BINARY && Data.Value.Binary.Data != NULL) || (Stream.Get() != NULL) ); } }; /// Gets the document in format of a desired MIME type virtual bool GetFormattedContent ( /// [In] The desired mime type of the content const char *MimeType, /// [Out] The content in the specified mime type LString &Out, /// [Out/Optional] Any attached media files that the content references LArray *Media = NULL ) { return false; } }; /// Detects links in text, returning their location and type template bool LDetectLinks(LArray &Links, T *Text, ssize_t TextCharLen = -1) { if (!Text) return false; if (TextCharLen < 0) TextCharLen = Strlen(Text); T *End = Text + TextCharLen; static T Http[] = {'h', 't', 't', 'p', ':', '/', '/', 0 }; static T Https[] = {'h', 't', 't', 'p', 's', ':', '/', '/', 0}; for (int64 i=0; i= 7 && ( Strnicmp(Text+i, Http, 6) == 0 || Strnicmp(Text+i, Https, 7) == 0 ) ) { // find end T *s = Text + i; T *e = s + 6; for ( ; e < End && UrlChar(*e); e++) ; while ( e > s && ! ( IsAlpha(e[-1]) || IsDigit(e[-1]) || e[-1] == '/' ) ) e--; Links.New().Set(s - Text, e - s, false); i = e - Text; } break; } case '@': { // find start T *s = Text + (MAX(i, 1) - 1); for ( ; s > Text && EmailChar(*s); s--) ; if (s < Text + i) { if (!EmailChar(*s)) s++; bool FoundDot = false; T *Start = Text + i + 1; T *e = Start; for ( ; e < End && EmailChar(*e); e++) { if (*e == '.') FoundDot = true; } while (e > Start && e[-1] == '.') e--; if (FoundDot) { Links.New().Set(s - Text, e - s, true); i = e - Text; } } break; } } } return true; } diff --git a/src/common/Text/HtmlParser.cpp b/src/common/Text/HtmlParser.cpp --- a/src/common/Text/HtmlParser.cpp +++ b/src/common/Text/HtmlParser.cpp @@ -1,1602 +1,1602 @@ #include "lgi/common/Lgi.h" #include "lgi/common/DocView.h" #include "lgi/common/HtmlCommon.h" #include "lgi/common/HtmlParser.h" #include "lgi/common/Unicode.h" #include "lgi/common/Charset.h" #define FEATURE_REATTACH_ELEMENTS 1 #define IsBlock(d) ((d) == LCss::DispBlock) char *LHtmlParser::NextTag(char *s) { while (s && *s) { char *n = strchr(s, '<'); if (n) { if (!n[1]) return NULL; if (IsAlpha(n[1]) || strchr("!/", n[1]) || n[1] == '?') { return n; } s = n + 1; } else break; } return 0; } LHtmlElement *LHtmlParser::GetOpenTag(const char *Tag) { if (Tag) { for (int i=(int)OpenTags.Length()-1; i>=0; i--) { LHtmlElement *t = OpenTags[i]; if (t->Tag) { if (_stricmp(t->Tag, Tag) == 0) { return t; } if (_stricmp(t->Tag, "table") == 0) { // stop looking... we don't close tags outside // the table from inside. break; } } } } return 0; } void LHtmlParser::SkipNonDisplay(char *&s) { while (*s) { SkipWhiteSpace(s); if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-') { s += 4; char *e = strstr(s, "-->"); if (e) s = e + 3; else { s += strlen(s); break; } } else break; } } char16 *LHtmlParser::DecodeEntities(const char *s, ssize_t len) { char16 buf[256]; char16 *o = buf; const char *end = s + len; LStringPipe p(256); for (const char *i = s; i < end; ) { if (o - buf > CountOf(buf) - 32) { // We are getting near the end of the buffer... // push existing data into the LStringPipe and // reset the output ptr. p.Write(buf, (o - buf) * sizeof(*o) ); o = buf; } switch (*i) { case '&': { i++; if (*i == '#') { // Unicode Number char n[32] = "", *p = n; char16 Ch; i++; if (*i == 'x' || *i == 'X') { // Hex number i++; while ( *i && ( IsDigit(*i) || (*i >= 'A' && *i <= 'F') || (*i >= 'a' && *i <= 'f') ) && (p - n) < 31) { *p++ = *i++; } *p++ = 0; Ch = htoi(n); } else { // Decimal number while (*i && IsDigit(*i) && (p - n) < 31) { *p++ = *i++; } *p++ = 0; Ch = atoi(n); } if (Ch) *o++ = Ch; if (*i && *i != ';') i--; } else { // Named Char const char *e = i; while (*e && IsAlpha(*e) && *e != ';') { e++; } LAutoWString Var(Utf8ToWide(i, e-i)); uint32_t Char = LHtmlStatic::Inst->VarMap.Find(Var); if (Char) { *o++ = Char; i = e; } else { i--; *o++ = *i; } } break; } case '\r': { break; } case ' ': case '\t': case '\n': { *o++ = *i; break; } default: { // Normal char *o++ = *i; break; } } if (*i) i++; else break; } *o = 0; if (p.GetSize() > 0) { // Long string mode... use the LStringPipe p.Write(buf, (o - buf) * sizeof(*o)); return p.NewStrW(); } return NewStrW(buf, o - buf); } char *LHtmlParser::ParsePropValue(char *s, char16 *&Value) { Value = 0; if (s) { if (strchr("\"\'", *s)) { char Delim = *s++; char *Start = s; while (*s && *s != Delim) s++; Value = DecodeEntities(Start, s - Start); if (*s) s++; } else { char *Start = s; while (*s && !IsWhiteSpace(*s) && *s != '>') s++; Value = DecodeEntities(Start, s - Start); } if (Value && View && View->GetEnv()) { while (true) { auto Start = Strstr(Value, L""); if (!End) break; - LString Code = LString(Start + 2, End - Start - 2).Strip(); - LString Result = View->GetEnv()->OnDynamicContent(View, Code); - LString NewValue = LString(Value, Start - Value) + + auto Code = LString(Start + 2, End - Start - 2).Strip(); + auto Result = View->GetEnv()->OnDynamicContent(View, Code); + auto NewValue = LString(Value, Start - Value) + Result + LString(End + 2); DeleteArray(Value); Value = Utf8ToWide(NewValue); } } } return s; } char *LHtmlParser::ParseName(char *s, char **Name) { LAutoString a; s = ParseName(s, a); if (Name) *Name = a.Release(); return s; } char *LHtmlParser::ParseName(char *s, LAutoString &Name) { SkipWhiteSpace(s); char *Start = s; while (*s && (IsAlpha(*s) || strchr("!-:\"\'", *s) || IsDigit(*s))) { s++; } ssize_t Len = s - Start; if (Len > 0) { Name.Reset(NewStr(Start, Len)); } return s; } char *LHtmlParser::ParsePropList(char *s, LHtmlElement *Obj, bool &Closed) { while (s && *s) { while (*s && IsWhiteSpace(*s)) s++; if (*s == '/') { Closed = true; s++; } if (*s == '>' || !*s) break; // get name char *Name = 0; char *n = ParseName(s, &Name); if (!n || !*n) break; if (n == s) // Don't get stuck... s = ++n; else s = n; while (*s && IsWhiteSpace(*s)) s++; if (*s == '=') { // get value s++; while (*s && IsWhiteSpace(*s)) s++; char16 *Value = 0; s = ParsePropValue(s, Value); if (Name && Value && *Value) { #if defined(_DEBUG) if (!_stricmp(Name, "src")) { // printf("%s = %S\n", Name, Value); } #endif Obj->Set(Name, Value); } DeleteArray(Value); } DeleteArray(Name); while (*s && IsWhiteSpace(*s)) s++; if (!*s || *s == '<') return s; if (*s == '>' || *s == '/') break; } if (*s == '/') s++; if (*s == '>') s++; return s; } LHtmlElemInfo *LHtmlParser::GetTagInfo(const char *Tag) { LAssert(LHtmlStatic::Inst != NULL); return LHtmlStatic::Inst->GetTagInfo(Tag); } void DumpDomTree(LHtmlElement *e, int Depth = 0) { char Sp[256]; int d = Depth << 1; memset(Sp, ' ', d); Sp[d] = 0; LgiTrace("%s%s (%p,%p)\n", Sp, e->Tag.Get(), e, e->Parent); for (unsigned i=0; iChildren.Length(); i++) { DumpDomTree(e->Children[i], Depth+1); } } bool LHtmlParser::Parse(LHtmlElement *Root, const char *Doc) { /* if (Doc) { LFile f("c:\\tmp\\parser.html", O_WRITE); f.SetSize(0); f.Write(Doc, strlen(Doc)); } */ SourceData.Empty(); CurrentSrc = Doc; OpenTags.Length(0); ParseHtml(Root, (char*)Doc, 0); // DumpDomTree(Root); if (CurrentSrc) SourceData.Write(CurrentSrc, strlen(CurrentSrc)); Source.Reset(SourceData.NewStr()); return true; } char *LHtmlParser::ParseHtml(LHtmlElement *Elem, char *Doc, int Depth, bool InPreTag, bool *BackOut) { #if CRASH_TRACE LgiTrace("::ParseHtml Doc='%.10s'\n", Doc); #endif if (Depth >= 500) { // Bail return Doc + strlen(Doc); } bool IsFirst = true; for (char *s=Doc; s && *s; ) { char *StartTag = s; if (*s == '<') { if (s[1] == '?') { // Dynamic content // Write out the document before the dynamic section if (s > CurrentSrc) { SourceData.Write(CurrentSrc, s - CurrentSrc); } // Process dynamic section s += 2; while (*s && IsWhiteSpace(*s)) s++; if (_strnicmp(s, "xml:namespace", 13) == 0) { // Ignore Outlook generated HTML tag char *e = strchr(s, '/'); while (e) { if (e[1] == '>' || (e[1] == '?' && e[2] == '>')) { if (e[1] == '?') s = e + 3; else s = e + 2; break; } e = strchr(e + 1, '/'); } if (!e) LAssert(0); } else { char *Start = s; while (*s && (!(s[0] == '?' && s[1] == '>'))) { if (IsQuote(*s)) { char d = *s++; s = strchr(s, d); if (s) s++; else break; } else s++; } if (s) { if (s[0] == '?' && s[1] == '>' && View && View->GetEnv()) { char *e = s - 1; while (e > Start && IsWhiteSpace(*e)) e--; e++; LString Code(Start, e - Start); if (Code) { - LString Result = View->GetEnv()->OnDynamicContent(View, Code); + auto Result = View->GetEnv()->OnDynamicContent(View, Code); if (Result) { // Save the dynamic code to the source pipe - SourceData.Write(Result, Result.Length()); + SourceData.Write(Result); // Create some new elements based on the dynamically generated string char *p = Result; do { LHtmlElement *c = CreateElement(Elem); if (c) { p = ParseHtml(c, p, Depth + 1, InPreTag); } else break; } while (ValidStr(p)); } } s += 2; } } } // Move current position to after the dynamic section CurrentSrc = s; } else if (s[1] == '!' && s[2] == '-' && s[3] == '-') { // Comment s = strstr(s, "-->"); if (s) s += 3; } else if (s[1] == '!' && s[2] == '[') { // Parse conditional... char *StartTag = s; s += 3; char *Cond = 0; s = ParseName(s, &Cond); if (!Cond) { while (*s && *s != ']') s++; if (*s == ']') s++; if (*s == '>') s++; return s; } bool IsEndIf = false; if (!_stricmp(Cond, "if")) { if (!IsFirst) { DeleteArray(Cond); s = StartTag; goto DoChildTag; } Elem->TagId = CONDITIONAL; SkipWhiteSpace(s); char *Start = s; while (*s && *s != ']') s++; Elem->Condition.Reset(NewStr(Start, s-Start)); Elem->Tag.Reset(NewStr("[if]")); Elem->Info = GetTagInfo(Elem->Tag); if (!EvaluateCondition(Elem->Condition)) Elem->Display(LCss::DispNone); else Elem->Display(LCss::DispInline); OpenTags.Add(Elem); } else if (!_stricmp(Cond, "endif")) { LHtmlElement *MatchingIf = NULL; for (int i = (int)OpenTags.Length()-1; i>=0; i--) { LHtmlElement *e = OpenTags[i]; if (e->TagId == CONDITIONAL && e->Tag && !_stricmp(e->Tag, "[if]")) { MatchingIf = e; break; } } if (MatchingIf) { MatchingIf->WasClosed = true; IsEndIf = true; OpenTags.Delete(MatchingIf); } } DeleteArray(Cond); while (*s && *s != '>') s++; if (*s == '>') s++; if (IsEndIf) return s; } else if (s[1] == '!') { s += 2; s = strchr(s, '>'); if (s) s++; else return NULL; } else if (IsAlpha(s[1])) { // Start tag if (Elem->Parent && IsFirst) { // My tag s = ParseName(++s, Elem->Tag); if (!Elem->Tag) { if (BackOut) *BackOut = true; return s; } bool TagClosed = false; s = ParsePropList(s, Elem, TagClosed); #if 0 // _DEBUG int Depth = 0; for (LHtmlElement *ep = Elem; ep; ep=ep->Parent) { Depth++; } char Sp[256]; Depth<<=1; memset(Sp, ' ', Depth); Sp[Depth] = 0; LgiTrace("%s%s (this=%p, parent=%p)\n", Sp, Elem->Tag, Elem, Elem->Parent); #endif bool AlreadyOpen = false; Elem->Info = GetTagInfo(Elem->Tag); if (Elem->Info) { if (Elem->Info->Flags & LHtmlElemInfo::TI_SINGLETON) { // Do singleton check... we don't want nested BODY or HEAD tags... for (int i = (int)OpenTags.Length() - 1; i >= 0; i--) { LHtmlElement *e = OpenTags[i]; if (e->TagId == TAG_IFRAME) { // In the case of IFRAMEs... don't consider the parent document. break; } if (e->Tag && !_stricmp(e->Tag, Elem->Tag)) { AlreadyOpen = true; #if 0 // This dumps the tags in the list it = OpenTags.Start(); LgiTrace("Open tags:\n"); for (LHtmlElement *e = *it; e; e = *++it) { LAutoString a = e->DescribeElement(); LgiTrace("\t%s\n", a.Get()); } #endif break; } } } Elem->TagId = Elem->Info->Id; auto Dsp = TestFlag ( Elem->Info->Flags, LHtmlElemInfo::TI_BLOCK ) || ( Elem->Tag && Elem->Tag[0] == '!' ) ? LCss::DispBlock : LCss::DispInline; Elem->Display(Dsp); if (Elem->TagId == TAG_PRE) { InPreTag = true; } if (Elem->TagId == TAG_META) { LAutoString Cs; const char *s; if (Elem->Get("http-equiv", s) && _stricmp(s, "Content-Type") == 0) { const char *ContentType; if (Elem->Get("content", ContentType)) { char *CharSet = stristr(ContentType, "charset="); if (CharSet) { char16 *cs = NULL; ParsePropValue(CharSet + 8, cs); Cs.Reset(WideToUtf8(cs)); DeleteArray(cs); } } } if (Elem->Get("name", s) && _stricmp(s, "charset") == 0 && Elem->Get("content", s)) { Cs.Reset(NewStr(s)); } else if (Elem->Get("charset", s)) { Cs.Reset(NewStr(s)); } if (Cs) { if (Cs && _stricmp(Cs, "utf-16") != 0 && _stricmp(Cs, "utf-32") != 0 && LGetCsInfo(Cs)) { DocCharSet = Cs; } } } } if (IsBlock(Elem->Display()) || Elem->TagId == TAG_BR) { SkipNonDisplay(s); } Elem->SetStyle(); switch (Elem->TagId) { default: break; case TAG_SCRIPT: { char *End = stristr(s, ""); if (End) { if (View && View->GetEnv()) { *End = 0; LVariant Lang, Type; Elem->GetValue("language", Lang); Elem->GetValue("type", Type); View->GetEnv()->OnCompileScript(View, s, Lang.Str(), Type.Str()); *End = '<'; } else { Elem->Txt.Reset(Utf8ToWide(s, End - s)); } s = End; } break; } case TAG_TABLE: { if (Elem->Parent->TagId == TAG_TABLE) { // Um no... if (BackOut) { LHtmlElement *l = OpenTags.Last(); if (l && l->TagId == TAG_TABLE) CloseTag(l); *BackOut = true; return StartTag; } } break; } case TAG_STYLE: { char *End = stristr(s, ""); if (!End) { // wtf? Better come up with a plan b... for (End = s; *End; End++) { if (End[0] == '<' && IsAlpha(End[1])) break; } } if (End) { if (View) { LAutoString Css(NewStr(s, End - s)); if (Css) { View->OnAddStyle("text/css", Css); } } s = End; } else { // It might be better to let the style bleed into the // layout rather than miss the rest of the document? return s; } break; } } if (AlreadyOpen || TagClosed || Elem->Info->NeverCloses()) { return s; } #if FEATURE_REATTACH_ELEMENTS if (Elem->Info->Reattach) { LArray ParentTags; bool CloseExisting = false; switch (Elem->TagId) { case TAG_LI: { ParentTags.Add(TAG_OL); ParentTags.Add(TAG_UL); break; } case TAG_HEAD: { ParentTags.Add(TAG_HTML); break; } case TAG_BODY: { ParentTags.Add(TAG_HTML); break; } case TAG_TBODY: { ParentTags.Add(TAG_TABLE); CloseExisting = true; break; } case TAG_TR: { ParentTags.Add(TAG_TBODY); ParentTags.Add(TAG_TABLE); CloseExisting = true; break; } case TAG_TD: case TAG_TH: { ParentTags.Add(TAG_TR); CloseExisting = true; break; } default: break; } if (CloseExisting) { LHtmlElement *p; for (int TagIdx = (int)OpenTags.Length()-1; TagIdx >= 0 && (p = OpenTags[TagIdx]) && p->TagId != TAG_TABLE; TagIdx--) { if (p->TagId == Elem->TagId) { CloseTag(p); break; } } } bool Reattach = !ParentTags.HasItem(Elem->Parent->TagId); if (Reattach) { if (Elem->TagId == TAG_HEAD) { // Ignore it.. return s; } else { LHtmlElement *Parent = NULL; for (int TagIdx = (int)OpenTags.Length()-1; TagIdx >= 0; TagIdx--) { LHtmlElement *t = OpenTags[TagIdx]; if (t->TagId && ParentTags.HasItem(t->TagId)) { Parent = t; break; } if (t->TagId == TAG_TABLE) break; } if (Parent) { // Reattach to the right parent. #if 0 auto IsOpen = OpenTags.IndexOf(Elem->Parent); LgiTrace("Reattaching '%s'(%p) to '%s'(%p) (Old=%s %i)\n", Elem->Tag.Get(), Elem, Parent->Tag.Get(), Parent, Elem->Parent->Tag.Get(), IsOpen); #endif Parent->Attach(Elem); } else { // Maybe there is no parent tag? switch (Elem->TagId) { case TAG_TD: case TAG_TH: { // Find a TBODY or TABLE to attach a new ROW LHtmlElement *Attach = Elem->Parent; while (Attach) { if (Attach->TagId == TAG_TABLE || Attach->TagId == TAG_TBODY) break; Attach = Attach->Parent; } if (Attach) { // Create a new ROW LHtmlElement *NewRow = CreateElement(Attach); if (NewRow) { NewRow->Tag.Reset(NewStr("tr")); NewRow->TagId = TAG_TR; NewRow->Info = GetTagInfo(NewRow->Tag); bool IsAttach = Attach->Children.HasItem(NewRow); if (IsAttach) { OpenTags.Add(NewRow); NewRow->Attach(Elem); // LgiTrace("Inserted new TAG_TR: %p\n", NewRow); } } else LAssert(!"Alloc error"); } // else LAssert(!"What now?"); break; } default: { // LgiTrace("%s:%i - Warning: '%s' is missing it's parent.\n", _FL, Elem->Tag.Get()); break; } } } } } } #endif OpenTags.Add(Elem); if (Elem->TagId == TAG_IFRAME) { LVariant Src; if (Elem->GetValue("src", Src) && View && View->GetEnv()) { LDocumentEnv::LoadJob *j = View->GetEnv()->NewJob(); if (j) { j->Uri.Reset(Src.ReleaseStr()); j->Env = View->GetEnv(); j->UserData = Elem; j->UserUid = View ? View->GetDocumentUid() : 0; // LgiTrace("%s:%i - new job %p, %p\n", _FL, j, j->UserData); LDocumentEnv::LoadType Result = View->GetEnv()->GetContent(j); if (Result == LDocumentEnv::LoadImmediate) { LStreamI *s = j->GetStream(); if (s) { uint64 Len = s->GetSize(); if (Len > 0) { LAutoString a(new char[(size_t)Len+1]); ssize_t r = s->Read(a, (int)Len); a[r] = 0; LHtmlElement *Child = CreateElement(Elem); if (Child) { bool BackOut = false; LArray ot = OpenTags; ParseHtml(Child, a, Depth + 1, false, &BackOut); OpenTags = ot; } } } } DeleteObj(j); } } } } else { // Child tag DoChildTag: LHtmlElement *c = CreateElement(Elem); if (c) { bool BackOut = false; s = ParseHtml(c, s, Depth + 1, InPreTag, &BackOut); if (BackOut) { c->Detach(); DeleteObj(c); return s; } else if (IsBlock(c->Display())) { while (c->Children.Length()) { LHtmlElement *Last = c->Children.Last(); if (Last->TagId == CONTENT && !ValidStrW(Last->Txt)) { Last->Detach(); DeleteObj(Last); } else break; } } } } } else if (s[1] == '/') { // End tag char *PreTag = s; s += 2; while (*s == '/') s++; // This code segment detects out of order HTML tags // and skips them. If we didn't do this then the parser // would get stuck on a Tag which has already been closed // and would return to the very top of the recursion. // // e.g. // // // // char *EndBracket = strchr(s, '>'); if (EndBracket) { char *e = EndBracket; while (e > s && strchr(WhiteSpace, e[-1])) e--; LAutoString Name(NewStr(s, e - s)); LHtmlElement *Open = GetOpenTag(Name); if (Open) { Open->WasClosed = true; } else { s = EndBracket + 1; continue; } } else { s += strlen(s); continue; } if (Elem->Tag) { // Compare against our tag char *t = Elem->Tag; while (*s && *t && toupper(*s) == toupper(*t)) { s++; t++; } SkipWhiteSpace(s); if (*s == '>') { LHtmlElement *t; while ((t = OpenTags.Last())) { CloseTag(t); if (t == Elem || OpenTags.Length() == 0) { break; } } s++; if (IsBlock(Elem->Display()) || Elem->TagId == TAG_BR) { SkipNonDisplay(s); } if (Elem->Parent) { return s; } } } else { // Error case happens with borked HTML s = EndBracket + 1; } if (Elem->Parent) { return PreTag; } } else { goto PlainText; } } else if (*s) { // Text child PlainText: char *n = NextTag(s); ssize_t Len = n ? n - s : strlen(s); LAutoWString WStr(CleanText(s, Len, true, InPreTag)); if (WStr && *WStr) { // This loop processes the text into lengths that need different treatment enum TxtClass { TxtNone, TxtEmoji, TxtEol, TxtNull, }; char16 *Start = WStr; LHtmlElement *Child = NULL; for (char16 *c = WStr; true; c++) { TxtClass Cls = TxtNone; /* if (Html->d->DecodeEmoji && *c >= EMOJI_START && *c <= EMOJI_END) Cls = TxtEmoji; else */ if (InPreTag && *c == '\n') Cls = TxtEol; else if (!*c) Cls = TxtNull; if (Cls) { if (c > Start) { // Emit the text before the point of interest... LAutoWString Cur; if (Start == WStr && !*c) { // Whole string Cur = WStr; } else { // Sub-string Cur.Reset(NewStrW(Start, c - Start)); } if (Elem->Children.Length() == 0 && (!Elem->Info || !Elem->Info->NoText()) && !Elem->Txt) { Elem->Txt = Cur; } else if ((Child = CreateElement(Elem))) { Child->Txt = Cur; } } // Now process the text of interest... /* if (Cls == TxtEmoji) { // Emit the emoji image LHtmlElement *img = CreateElement(Elem); if (img) { img->Tag.Reset(NewStr("img")); if ((img->Info = GetTagInfo(img->Tag))) img->TagId = img->Info->Id; LRect rc; EMOJI_CH2LOC(*c, rc); img->Set("src", Html->d->EmojiImg); char css[256]; sprintf_s(css, sizeof(css), "x-rect: rect(%i,%i,%i,%i);", rc.y1, rc.x2, rc.y2, rc.x1); img->Set("style", css); img->SetStyle(); } Start = c + 1; } else */ if (Cls == TxtEol) { // Emit the
tag LHtmlElement *br = CreateElement(Elem); if (br) { br->Tag.Reset(NewStr("br")); if ((br->Info = GetTagInfo(br->Tag))) br->TagId = br->Info->Id; } Start = c + 1; } } // Check for the end of string... if (!*c) break; } } s = n; } IsFirst = false; } #if CRASH_TRACE LgiTrace("::ParseHtml end\n"); #endif return 0; } char16 *LHtmlParser::CleanText(const char *s, ssize_t Len, bool ConversionAllowed, bool KeepWhiteSpace) { static const char *DefaultCs = "iso-8859-1"; char16 *t = 0; if (s && Len > 0) { bool Has8 = false; if (Len >= 0) { for (int n = 0; n < Len; n++) { if (s[n] & 0x80) { Has8 = true; break; } } } else { for (int n = 0; s[n]; n++) { if (s[n] & 0x80) { Has8 = true; break; } } } bool DocAndCsTheSame = false; if (DocCharSet && View && View->GetCharset()) { DocAndCsTheSame = _stricmp(DocCharSet, View->GetCharset()) == 0; } if (!DocAndCsTheSame && DocCharSet && View && View->GetCharset() && !View->GetOverideDocCharset()) { const char *ViewCs = View->GetCharset(); char *DocText = (char*)LNewConvertCp(DocCharSet, s, ViewCs, Len); if (DocText) { t = (char16*) LNewConvertCp(LGI_WideCharset, DocText, DocCharSet, -1); DeleteArray(DocText); } else { // Can't convert to doc charset? iconv missing? t = Utf8ToWide(s, Len); } } else if (DocCharSet) { t = (char16*) LNewConvertCp(LGI_WideCharset, s, DocCharSet, Len); } else { const char *ViewCs = View ? View->GetCharset() : NULL; t = (char16*) LNewConvertCp(LGI_WideCharset, s, ViewCs?ViewCs:DefaultCs, Len); } if (t && ConversionAllowed) { char16 *o = t; for (char16 *i=t; *i; ) { switch (*i) { case '&': { i++; if (*i == '#') { // Unicode Number char n[32] = "", *p = n; i++; uint32_t Ch = 0; if (*i == 'x' || *i == 'X') { // Hex number i++; while ( *i && ( IsDigit(*i) || (*i >= 'A' && *i <= 'F') || (*i >= 'a' && *i <= 'f') ) && (p - n) < 31) { *p++ = (char)*i++; } *p = 0; Ch = htoi(n); } else { // Decimal number while (*i && IsDigit(*i) && (p - n) < 31) { *p++ = (char)*i++; } *p = 0; Ch = atoi(n); } if (Ch) { if (sizeof(*o) < 4) { ssize_t Len = 4; LgiUtf32To16(Ch, (uint16_t*&)o, Len); } else { *o++ = Ch; } } if (*i && *i != ';') i--; } else { // Named Char char16 *e = i; while (*e && IsAlpha(*e) && *e != ';') { e++; } LAutoWString Var(NewStrW(i, e-i)); char16 Char = LHtmlStatic::Inst->VarMap.Find(Var); if (Char) { *o++ = Char; i = e; } else { i--; *o++ = *i; } } break; } case '\r': { break; } case ' ': case '\t': case '\n': { if (KeepWhiteSpace) { *o++ = *i; } else { *o++ = ' '; // Skip furthur whitespace while (i[1] && IsWhiteSpace(i[1])) { i++; } } break; } default: { // Normal char *o++ = *i; break; } } if (*i) i++; else break; } *o++ = 0; } } if (t && !*t) { DeleteArray(t); } return t; } void LHtmlParser::_TraceOpenTags() { LStringPipe p; for (unsigned i = 0; i < OpenTags.Length(); i++) { LHtmlElement *t = OpenTags[i]; p.Print(", %s", t->Tag.Get()); LVariant Id; if (t->GetValue("id", Id)) { p.Print("#%s", Id.Str()); } } char *s = p.NewStr(); if (s) { LgiTrace("Open tags = '%s'\n", s + 2); DeleteArray(s); } } bool LHtmlParser::ParseColour(const char *s, LCss::ColorDef &c) { if (s) { int m; if (*s == '#') { s++; ParseHexColour: int i = htoi(s); size_t l = strlen(s); if (l == 3) { int r = i >> 8; int g = (i >> 4) & 0xf; int b = i & 0xf; c.Type = LCss::ColorRgb; c.Rgb32 = Rgb32(r | (r<<4), g | (g << 4), b | (b << 4)); } else if (l == 4) { int r = (i >> 12) & 0xf; int g = (i >> 8) & 0xf; int b = (i >> 4) & 0xf; int a = i & 0xf; c.Type = LCss::ColorRgb; c.Rgb32 = Rgba32( r | (r <<4 ), g | (g << 4), b | (b << 4), a | (a << 4)); } else if (l == 6) { c.Type = LCss::ColorRgb; c.Rgb32 = Rgb32(i >> 16, (i >> 8) & 0xff, i & 0xff); } else if (l == 8) { c.Type = LCss::ColorRgb; c.Rgb32 = Rgba32(i >> 24, (i >> 16) & 0xff, (i >> 8) & 0xff, i & 0xff); } else { return false; } return true; } else if ((m = LHtmlStatic::Inst->ColourMap.Find(s)) >= 0) { c.Type = LCss::ColorRgb; c.Rgb32 = Rgb24To32(m); return true; } else if (!_strnicmp(s, "rgb", 3)) { s += 3; SkipWhiteSpace(s); if (*s == '(') { s++; LArray Col; while (Col.Length() < 3) { SkipWhiteSpace(s); if (IsDigit(*s)) { Col.Add(atoi(s)); while (*s && IsDigit(*s)) s++; SkipWhiteSpace(s); if (*s == ',') s++; } else break; } SkipWhiteSpace(s); if (*s == ')' && Col.Length() == 3) { c.Type = LCss::ColorRgb; c.Rgb32 = Rgb32(Col[0], Col[1], Col[2]); return true; } } } else if (IsDigit(*s) || (tolower(*s) >= 'a' && tolower(*s) <= 'f')) { goto ParseHexColour; } } return false; } bool LHtmlParser::Is8Bit(char *s) { while (*s) { if (((uchar)*s) & 0x80) return true; s++; } return false; }