htmlparser修改html,HTMLparser

Details

htmlParserCtxttypedef xmlParserCtxt htmlParserCtxt;

htmlParserCtxtPtrtypedef xmlParserCtxtPtr htmlParserCtxtPtr;

htmlParserNodeInfotypedef xmlParserNodeInfo htmlParserNodeInfo;

htmlSAXHandlertypedef xmlSAXHandler htmlSAXHandler;

htmlSAXHandlerPtrtypedef xmlSAXHandlerPtr htmlSAXHandlerPtr;

htmlParserInputtypedef xmlParserInput htmlParserInput;

htmlParserInputPtrtypedef xmlParserInputPtr htmlParserInputPtr;

htmlDocPtrtypedef xmlDocPtr htmlDocPtr;

htmlNodePtrtypedef xmlNodePtr htmlNodePtr;

struct htmlElemDescstruct htmlElemDesc {

const char *name;/* The tag name */

char startTag; /* Whether the start tag can be implied */

char endTag; /* Whether the end tag can be implied */

char saveEndTag; /* Whether the end tag should be saved */

char empty; /* Is this an empty element ? */

char depr; /* Is this a deprecated element ? */

char dtd; /* 1: only in Loose DTD, 2: only Frameset one */

char isinline; /* is this a block 0 or inline 1 element */

const char *desc; /* the description */

};

htmlElemDescPtrtypedef htmlElemDesc *htmlElemDescPtr;

struct htmlEntityDescstruct htmlEntityDesc {

unsigned int value;/* the UNICODE value for the character */

const char *name;/* The entity name */

const char *desc; /* the description */

};

htmlEntityDescPtrtypedef htmlEntityDesc *htmlEntityDescPtr;

htmlTagLookup ()const htmlElemDesc* htmlTagLookup (const xmlChar *tag);

Lookup the HTML tag in the ElementTable

tag :The tag name in lowercase

Returns :the related htmlElemDescPtr or NULL if not found.

htmlEntityLookup ()const htmlEntityDesc* htmlEntityLookup (const xmlChar *name);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly, a hash table is really needed.

name :the entity name

Returns :the associated htmlEntityDescPtr if found, NULL otherwise.

htmlEntityValueLookup ()const htmlEntityDesc* htmlEntityValueLookup (unsigned int value);

Lookup the given entity in EntitiesTable

TODO: the linear scan is really ugly, a hash table is really needed.

value :the entity's unicode value

Returns :the associated htmlEntityDescPtr if found, NULL otherwise.

htmlIsAutoClosed ()int htmlIsAutoClosed (htmlDocPtr doc,

htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags.

The list is kept in htmlStartClose array. This function checks

if a tag is autoclosed by one of its children

doc :the HTML document

elem :the HTML element

Returns :1 if autoclosed, 0 otherwise

htmlAutoCloseTag ()int htmlAutoCloseTag (htmlDocPtr doc,

const xmlChar *name,

htmlNodePtr elem);

The HTML DTD allows a tag to implicitly close other tags.

The list is kept in htmlStartClose array. This function checks

if the element or one of its children would autoclose the

given tag.

doc :the HTML document

name :The tag name

elem :the HTML element

Returns :1 if autoclose, 0 otherwise

htmlParseEntityRef ()const htmlEntityDesc* htmlParseEntityRef (htmlParserCtxtPtr ctxt,

xmlChar **str);

parse an HTML entity reference

[68] EntityRef ::= '&' Name ';'

ctxt :an HTML parser context

str :location to store the entity name

Returns :the associated htmlEntityDescPtr if found, or NULL otherwise,

if non-NULL *str will have to be freed by the caller.

htmlParseCharRef ()int htmlParseCharRef (htmlParserCtxtPtr ctxt);

parse Reference declarations

[66] CharRef ::= '&#' [0-9]+ ';' |

'&#x' [0-9a-fA-F]+ ';'

ctxt :an HTML parser context

Returns :the value parsed (as an int)

htmlParseElement ()void htmlParseElement (htmlParserCtxtPtr ctxt);

parse an HTML element, this is highly recursive

[39] element ::= EmptyElemTag | STag content ETag

[41] Attribute ::= Name Eq AttValue

ctxt :an HTML parser context

htmlParseDocument ()int htmlParseDocument (htmlParserCtxtPtr ctxt);

parse an HTML document (and build a tree if using the standard SAX

interface).

ctxt :an HTML parser context

Returns :0 on success, -1 in case of error. The parser context is augmented

as a result of the parsing.

htmlSAXParseDoc ()htmlDocPtr htmlSAXParseDoc (xmlChar *cur,

const char *encoding,

htmlSAXHandlerPtr sax,

void *userData);

Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks

to handle parse events. If sax is NULL, fallback to the default DOM

behavior and return a tree.

cur :a pointer to an array of xmlChar

encoding :a free form C string describing the HTML document encoding, or NULL

sax :the SAX handler block

userData :if using SAX, this pointer will be provided on callbacks.

Returns :the resulting document tree unless sax is non-NULL or the document is

not well formed.

htmlParseDoc ()htmlDocPtr htmlParseDoc (xmlChar *cur,

const char *encoding);

parse an HTML in-memory document and build a tree.

cur :a pointer to an array of xmlChar

encoding :a free form C string describing the HTML document encoding, or NULL

Returns :the resulting document tree

htmlSAXParseFile ()htmlDocPtr htmlSAXParseFile (const char *filename,

const char *encoding,

htmlSAXHandlerPtr sax,

void *userData);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress

compressed document is provided by default if found at compile-time.

It uses the given SAX function block to handle the parsing callback.

If sax is NULL, fallback to the default DOM tree building routines.

filename :the filename

encoding :a free form C string describing the HTML document encoding, or NULL

sax :the SAX handler block

userData :if using SAX, this pointer will be provided on callbacks.

Returns :the resulting document tree unless sax is non-NULL or the document is

not well formed.

htmlParseFile ()htmlDocPtr htmlParseFile (const char *filename,

const char *encoding);

parse an HTML file and build a tree. Automatic support for ZLIB/Compress

compressed document is provided by default if found at compile-time.

filename :the filename

encoding :a free form C string describing the HTML document encoding, or NULL

Returns :the resulting document tree

UTF8ToHtml ()int UTF8ToHtml (unsigned char *out,

int *outlen,

unsigned char *in,

int *inlen);

Take a block of UTF-8 chars in and try to convert it to an ASCII

plus HTML entities block of chars out.

out :a pointer to an array of bytes to store the result

outlen :the length of out

in :a pointer to an array of UTF-8 chars

inlen :the length of in

Returns :0 if success, -2 if the transcoding fails, or -1 otherwise

The value of inlen after return is the number of octets consumed

if the return value is positive, else unpredictable.

The value of outlen after return is the number of octets produced.

htmlEncodeEntities ()int htmlEncodeEntities (unsigned char *out,

int *outlen,

unsigned char *in,

int *inlen,

int quoteChar);

Take a block of UTF-8 chars in and try to convert it to an ASCII

plus HTML entities block of chars out.

out :a pointer to an array of bytes to store the result

outlen :the length of out

in :a pointer to an array of UTF-8 chars

inlen :the length of in

quoteChar :the quote character to escape (' or ") or zero.

Returns :0 if success, -2 if the transcoding fails, or -1 otherwise

The value of inlen after return is the number of octets consumed

if the return value is positive, else unpredictable.

The value of outlen after return is the number of octets produced.

htmlIsScriptAttribute ()int htmlIsScriptAttribute (const xmlChar *name);

Check if an attribute is of content type Script

name :an attribute name

Returns :1 if the attribute is a script, 0 otherwise

htmlHandleOmittedElem ()int htmlHandleOmittedElem (int val);

Set and return the previous value for handling HTML omitted tags.

val :int 0 or 1

Returns :the previous value: 0 for no handling, 1 for auto insertion.

htmlFreeParserCtxt ()void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);

Free all the memory used by a parser context. However the parsed

document in ctxt->myDoc is not freed.

ctxt :an HTML parser context

htmlCreatePushParserCtxt ()htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax,

void *user_data,

const char *chunk,

int size,

const char *filename,

xmlCharEncoding enc);

Create a parser context for using the HTML parser in push mode

To allow content encoding detection, size should be >= 4

The value of filename is used for fetching external entities

and error/warning reports.

sax :a SAX handler

user_data :The user data returned on SAX callbacks

chunk :a pointer to an array of chars

size :number of chars in the array

filename :an optional file name or URI

enc :an optional encoding

Returns :the new parser context or NULL

htmlParseChunk ()int htmlParseChunk (htmlParserCtxtPtr ctxt,

const char *chunk,

int size,

int terminate);

Parse a Chunk of memory

ctxt :an HTML parser context

chunk :a char array

size :the size in bytes of the chunk

terminate :last chunk indicator

Returns :zero if no error, the xmlParserErrors otherwise.

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
按DOM模型解析html文件的工具包 已下是源码列表: META-INF/MANIFEST.MF META-INF/maven/org.htmlparser/htmlparser/pom.properties META-INF/maven/org.htmlparser/htmlparser/pom.xml org.htmlparser.Parser.class org.htmlparser.PrototypicalNodeFactory.class org.htmlparser.beans.BeanyBaby.class org.htmlparser.beans.FilterBean.class org.htmlparser.beans.HTMLLinkBean.class org.htmlparser.beans.HTMLTextBean.class org.htmlparser.beans.LinkBean.class org.htmlparser.beans.StringBean.class org.htmlparser.filters.AndFilter.class org.htmlparser.filters.CssSelectorNodeFilter.class org.htmlparser.filters.HasAttributeFilter.class org.htmlparser.filters.HasChildFilter.class org.htmlparser.filters.HasParentFilter.class org.htmlparser.filters.HasSiblingFilter.class org.htmlparser.filters.IsEqualFilter.class org.htmlparser.filters.LinkRegexFilter.class org.htmlparser.filters.LinkStringFilter.class org.htmlparser.filters.NodeClassFilter.class org.htmlparser.filters.NotFilter.class org.htmlparser.filters.OrFilter.class org.htmlparser.filters.RegexFilter.class org.htmlparser.filters.StringFilter.class org.htmlparser.filters.TagNameFilter.class org.htmlparser.http.HttpHeader.class org.htmlparser.sax.Attributes.class org.htmlparser.sax.Feedback.class org.htmlparser.sax.Locator.class org.htmlparser.sax.XMLReader.class org.htmlparser.scanners.CompositeTagScanner.class org.htmlparser.scanners.JspScanner.class org.htmlparser.scanners.ScriptDecoder.class org.htmlparser.scanners.ScriptScanner.class org.htmlparser.scanners.StyleScanner.class org.htmlparser.tags.AppletTag.class org.htmlparser.tags.BaseHrefTag.class org.htmlparser.tags.BlockquoteTag.class org.htmlparser.tags.BodyTag.class org.htmlparser.tags.Bullet.class org.htmlparser.tags.BulletList.class org.htmlparser.tags.CompositeTag.class org.htmlparser.tags.DefinitionList.class org.htmlparser.tags.DefinitionListBullet.class org.htmlparser.tags.Div.class org.htmlparser.tags.DoctypeTag.class org.htmlparser.tags.FormTag.class org.htmlparser.tags.FrameSetTag.class 
org.htmlparser.tags.FrameTag.class org.htmlparser.tags.HeadTag.class org.htmlparser.tags.HeadingTag.class org.htmlparser.tags.Html.class org.htmlparser.tags.ImageTag.class org.htmlparser.tags.InputTag.class org.htmlparser.tags.JspTag.class org.htmlparser.tags.LabelTag.class org.htmlparser.tags.LinkTag.class org.htmlparser.tags.MetaTag.class org.htmlparser.tags.ObjectTag.class org.htmlparser.tags.OptionTag.class org.htmlparser.tags.ParagraphTag.class org.htmlparser.tags.ProcessingInstructionTag.class org.htmlparser.tags.ScriptTag.class org.htmlparser.tags.SelectTag.class org.htmlparser.tags.Span.class org.htmlparser.tags.StyleTag.class org.htmlparser.tags.TableColumn.class org.htmlparser.tags.TableHeader.class org.htmlparser.tags.TableRow.class org.htmlparser.tags.TableTag.class org.htmlparser.tags.TextareaTag.class org.htmlparser.tags.TitleTag.class org.htmlparser.util.CharacterReference.class org.htmlparser.util.CharacterReferenceEx.class org.htmlparser.util.DefaultParserFeedback.class org.htmlparser.util.FeedbackManager.class org.htmlparser.util.IteratorImpl.class org.htmlparser.util.NodeTreeWalker.class org.htmlparser.util.ParserFeedback.class org.htmlparser.util.ParserUtils.class org.htmlparser.util.Translate.class org.htmlparser.visitors.HtmlPage.class org.htmlparser.visitors.LinkFindingVisitor.class org.htmlparser.visitors.ObjectFindingVisitor.class org.htmlparser.visitors.StringFindingVisitor.class org.htmlparser.visitors.TagFindingVisitor.class org.htmlparser.visitors.TextExtractingVisitor.class org.htmlparser.visitors.UrlModifyingVisitor.class org/htmlparser/beans/images/Chain16.gif org/htmlparser/beans/images/Chain32.gif org/htmlparser/beans/images/Knot16.gif org/htmlparser/beans/images/Knot32.gif
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值