htmlparser 编码问题

     有时候,在抓取网站的时候,网站的编码方式可能不统一,这样的情况,可能有些网页编码不成功,而htmlparser报了错,不能正常的读取。抛出来的异常为:org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old:  [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23

为了解决不管它是用何种编码方式,都能够正常读取数据。我在htmlparser的Page类中加了一个字段,之所以要在Page类中加,那是因为它历遍所有的标签过程中,把meta标签属性content捕获到了,并已经传值到setEncoding(String charset)办法中。

          代码如下:

 

// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.
// Jad home page: http://www.kpdus.com/jad.html
// Decompiler options: packimports(3) 
// Source File Name:   Page.java

package org.htmlparser.lexer;

import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;

// Referenced classes of package org.htmlparser.lexer:
//            InputStreamSource, PageIndex, StringSource, Cursor, 
//            Stream, Source

public class Page
    implements Serializable
{

    public Page()
    {
        this("");
    }

    public Page(URLConnection connection)
        throws ParserException
    {
        if(null == connection)
        {
            throw new IllegalArgumentException("connection cannot be null");
        } else
        {
            setConnection(connection);
            mBaseUrl = null;
            return;
        }
    }

    public Page(InputStream stream, String charset)
        throws UnsupportedEncodingException
    {
        if(null == stream)
            throw new IllegalArgumentException("stream cannot be null");
        if(null == charset)
            charset = "ISO-8859-1";
        mSource = new InputStreamSource(stream, charset);
        mIndex = new PageIndex(this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;
    }

    public Page(String text, String charset)
    {
        if(null == text)
            throw new IllegalArgumentException("text cannot be null");
        if(null == charset)
            charset = "ISO-8859-1";
        mSource = new StringSource(text, charset);
        mIndex = new PageIndex(this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;
    }

    public Page(String text)
    {
        this(text, null);
    }

    public Page(Source source)
    {
        if(null == source)
        {
            throw new IllegalArgumentException("source cannot be null");
        } else
        {
            mSource = source;
            mIndex = new PageIndex(this);
            mConnection = null;
            mUrl = null;
            mBaseUrl = null;
            return;
        }
    }

    public static ConnectionManager getConnectionManager()
    {
        return mConnectionManager;
    }

    public static void setConnectionManager(ConnectionManager manager)
    {
        mConnectionManager = manager;
    }

    public String getCharset(String content)
    {
        String CHARSET_STRING = "charset";
        String ret;
        if(null == mSource)
            ret = "ISO-8859-1";
        else
            ret = mSource.getEncoding();
        if(null != content)
        {
            int index = content.indexOf("charset");
            if(index != -1)
            {
                content = content.substring(index + "charset".length()).trim();
                if(content.startsWith("="))
                {
                    content = content.substring(1).trim();
                    index = content.indexOf(";");
                    if(index != -1)
                        content = content.substring(0, index);
                    if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())
                        content = content.substring(1, content.length() - 1);
                    if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())
                        content = content.substring(1, content.length() - 1);
                    ret = findCharset(content, ret);
                }
            }
        }
        return ret;
    }

    public static String findCharset(String name, String fallback)
    {
        String ret;
        try
        {
            Class cls = Class.forName("java.nio.charset.Charset");
            Method method = cls.getMethod("forName", new Class[] {
                java.lang.String.class
            });
            Object object = method.invoke(null, new Object[] {
                name
            });
            method = cls.getMethod("name", new Class[0]);
            object = method.invoke(object, new Object[0]);
            ret = (String)object;
        }
        catch(ClassNotFoundException cnfe)
        {
            ret = name;
        }
        catch(NoSuchMethodException nsme)
        {
            ret = name;
        }
        catch(IllegalAccessException ia)
        {
            ret = name;
        }
        catch(InvocationTargetException ita)
        {
            ret = fallback;
            System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
        }
        return ret;
    }

    private void writeObject(ObjectOutputStream out)
        throws IOException
    {
        if(null != getConnection())
        {
            out.writeBoolean(true);
            out.writeInt(mSource.offset());
            String href = getUrl();
            out.writeObject(href);
            setUrl(getConnection().getURL().toExternalForm());
            Source source = getSource();
            mSource = null;
            PageIndex index = mIndex;
            mIndex = null;
            out.defaultWriteObject();
            mSource = source;
            mIndex = index;
        } else
        {
            out.writeBoolean(false);
            String href = getUrl();
            out.writeObject(href);
            setUrl(null);
            out.defaultWriteObject();
            setUrl(href);
        }
    }

    private void readObject(ObjectInputStream in)
        throws IOException, ClassNotFoundException
    {
        boolean fromurl = in.readBoolean();
        if(fromurl)
        {
            int offset = in.readInt();
            String href = (String)in.readObject();
            in.defaultReadObject();
            if(null != getUrl())
            {
                URL url = new URL(getUrl());
                try
                {
                    setConnection(url.openConnection());
                }
                catch(ParserException pe)
                {
                    throw new IOException(pe.getMessage());
                }
            }
            Cursor cursor = new Cursor(this, 0);
            for(int i = 0; i < offset; i++)
                try
                {
                    getCharacter(cursor);
                }
                catch(ParserException pe)
                {
                    throw new IOException(pe.getMessage());
                }

            setUrl(href);
        } else
        {
            String href = (String)in.readObject();
            in.defaultReadObject();
            setUrl(href);
        }
    }

    public void reset()
    {
        getSource().reset();
        mIndex = new PageIndex(this);
    }

    public void close()
        throws IOException
    {
        if(null != getSource())
            getSource().destroy();
    }

    protected void finalize()
        throws Throwable
    {
        close();
    }

    public URLConnection getConnection()
    {
        return mConnection;
    }

    public void setConnection(URLConnection connection)
        throws ParserException
    {
        mConnection = connection;
        mConnection.setConnectTimeout(6000);
        mConnection.setReadTimeout(6000);
        try
        {
            getConnection().connect();
        }
        catch(UnknownHostException uhe)
        {
            throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
        }
        catch(IOException ioe)
        {
            throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
        }
        String type = getContentType();
        String charset = getCharset(type);
        try
        {
            String contentEncoding = connection.getContentEncoding();
            System.out.println("contentEncoding="+contentEncoding);
            Stream stream;
            if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))
                stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
            else
            if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))
                stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
            else{
                stream = new Stream(getConnection().getInputStream());
            }

            try
            {
		/*
            	 * 时间:2008年12月23日
            	 * 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下
            	 */
            	if(charset.indexOf("ISO-8859-1")!=-1){
            		charset = getQICHAODEFAULT_CHARSET();
            	}
		 mSource = new InputStreamSource(stream, charset);
            }
            catch(UnsupportedEncodingException uee)
            {
                charset = "ISO-8859-1";
                mSource = new InputStreamSource(stream, charset);
            }
        }
        catch(IOException ioe)
        {
            throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
        }
        mUrl = connection.getURL().toExternalForm();
		mIndex = new PageIndex(this);
    }

    public String getUrl()
    {
        return mUrl;
    }

    public void setUrl(String url)
    {
        mUrl = url;
    }

    public String getBaseUrl()
    {
        return mBaseUrl;
    }

    public void setBaseUrl(String url)
    {
        mBaseUrl = url;
    }

    public Source getSource()
    {
        return mSource;
    }

    public String getContentType()
    {
        String ret = "text/html";
        URLConnection connection = getConnection();
        if(null != connection)
        {
            String content = connection.getHeaderField("Content-Type");
            if(null != content)
                ret = content;
        }
        return ret;
    }

    public char getCharacter(Cursor cursor)
        throws ParserException
    {
        int i = cursor.getPosition();
        int offset = mSource.offset();
        char ret;
        if(offset == i)
            try
            {
                i = mSource.read();
                if(-1 == i)
                {
                    ret = '\uFFFF';
                } else
                {
                    ret = (char)i;
                    cursor.advance();
                }
            }
            catch(IOException ioe)
            {
                throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
            }
        else
        if(offset > i)
        {
            try
            {
                ret = mSource.getCharacter(i);
            }
            catch(IOException ioe)
            {
                throw new ParserException("can't read a character at position " + i, ioe);
            }
            cursor.advance();
        } else
        {
            throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
        }
        if('\r' == ret)
        {
            ret = '\n';
            if(mSource.offset() == cursor.getPosition())
                try
                {
                    i = mSource.read();
                    if(-1 != i)
                        if('\n' == (char)i)
                            cursor.advance();
                        else
                            try
                            {
                                mSource.unread();
                            }
                            catch(IOException ioe)
                            {
                                throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
                            }
                }
                catch(IOException ioe)
                {
                    throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
                }
            else
                try
                {
                    if('\n' == mSource.getCharacter(cursor.getPosition()))
                        cursor.advance();
                }
                catch(IOException ioe)
                {
                    throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
                }
        }
        if('\n' == ret)
            mIndex.add(cursor);
        return ret;
    }

    public void ungetCharacter(Cursor cursor)
        throws ParserException
    {
        cursor.retreat();
        int i = cursor.getPosition();
        try
        {
            char ch = mSource.getCharacter(i);
            if('\n' == ch && 0 != i)
            {
                ch = mSource.getCharacter(i - 1);
                if('\r' == ch)
                    cursor.retreat();
            }
        }
        catch(IOException ioe)
        {
            throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
        }
    }

    public String getEncoding()
    {
        return getSource().getEncoding();
    }

    public void setEncoding(String character_set)
        throws ParserException
    {
    	this.QICHAODEFAULT_CHARSET = character_set;
        getSource().setEncoding(character_set);
    }

    public URL constructUrl(String link, String base)
        throws MalformedURLException
    {
        return constructUrl(link, base, false);
    }

    public URL constructUrl(String link, String base, boolean strict)
        throws MalformedURLException
    {
        int index;
        URL url;
        if(!strict && '?' == link.charAt(0))
        {
            if(-1 != (index = base.lastIndexOf('?')))
                base = base.substring(0, index);
            url = new URL(base + link);
        } else
        {
            url = new URL(new URL(base), link);
        }
        String path = url.getFile();
        boolean modified = false;
        boolean absolute = link.startsWith("/");
        if(!absolute)
            do
            {
                if(!path.startsWith("/."))
                    break;
                if(path.startsWith("/../"))
                {
                    path = path.substring(3);
                    modified = true;
                    continue;
                }
                if(!path.startsWith("/./") && !path.startsWith("/."))
                    break;
                path = path.substring(2);
                modified = true;
            } while(true);
        while(-1 != (index = path.indexOf("/\\"))) 
        {
            path = path.substring(0, index + 1) + path.substring(index + 2);
            modified = true;
        }
        if(modified)
            url = new URL(url, path);
        return url;
    }

    public String getAbsoluteURL(String link)
    {
        return getAbsoluteURL(link, false);
    }

    public String getAbsoluteURL(String link, boolean strict)
    {
        String ret;
        if(null == link || "".equals(link))
            ret = "";
        else
            try
            {
                String base = getBaseUrl();
                if(null == base)
                    base = getUrl();
                if(null == base)
                {
                    ret = link;
                } else
                {
                    URL url = constructUrl(link, base, strict);
                    ret = url.toExternalForm();
                }
            }
            catch(MalformedURLException murle)
            {
                ret = link;
            }
        return ret;
    }

    public int row(Cursor cursor)
    {
        return mIndex.row(cursor);
    }

    public int row(int position)
    {
        return mIndex.row(position);
    }

    public int column(Cursor cursor)
    {
        return mIndex.column(cursor);
    }

    public int column(int position)
    {
        return mIndex.column(position);
    }

    public String getText(int start, int end)
        throws IllegalArgumentException
    {
        String ret;
        try
        {
            ret = mSource.getString(start, end - start);
        }
        catch(IOException ioe)
        {
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
        }
        return ret;
    }

    public void getText(StringBuffer buffer, int start, int end)
        throws IllegalArgumentException
    {
        if(mSource.offset() < start || mSource.offset() < end)
            throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
        int length;
        if(end < start)
        {
            length = end;
            end = start;
            start = length;
        }
        length = end - start;
        try
        {
            mSource.getCharacters(buffer, start, length);
        }
        catch(IOException ioe)
        {
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
        }
    }

    public String getText()
    {
        return getText(0, mSource.offset());
    }

    public void getText(StringBuffer buffer)
    {
        getText(buffer, 0, mSource.offset());
    }

    public void getText(char array[], int offset, int start, int end)
        throws IllegalArgumentException
    {
        if(mSource.offset() < start || mSource.offset() < end)
            throw new IllegalArgumentException("attempt to extract future characters from source");
        int length;
        if(end < start)
        {
            length = end;
            end = start;
            start = length;
        }
        length = end - start;
        try
        {
            mSource.getCharacters(array, offset, start, end);
        }
        catch(IOException ioe)
        {
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
        }
    }

    public String getLine(Cursor cursor)
    {
        int line = row(cursor);
        int size = mIndex.size();
        int start;
        int end;
        if(line < size)
        {
            start = mIndex.elementAt(line);
            if(++line <= size)
                end = mIndex.elementAt(line);
            else
                end = mSource.offset();
        } else
        {
            start = mIndex.elementAt(line - 1);
            end = mSource.offset();
        }
        return getText(start, end);
    }

    public String getLine(int position)
    {
        return getLine(new Cursor(this, position));
    }

    public String toString()
    {
        String ret;
        if(mSource.offset() > 0)
        {
            StringBuffer buffer = new StringBuffer(43);
            int start = mSource.offset() - 40;
            if(0 > start)
                start = 0;
            else
                buffer.append("...");
            getText(buffer, start, mSource.offset());
            ret = buffer.toString();
        } else
        {
            ret = super.toString();
        }
        return ret;
    }

    public static final String DEFAULT_CHARSET = "ISO-8859-1";
    public static String QICHAODEFAULT_CHARSET = "gb2312";
    public static final String DEFAULT_CONTENT_TYPE = "text/html";
    public static final char EOF = 65535;
    protected String mUrl;
    protected String mBaseUrl;
    protected Source mSource;
    protected PageIndex mIndex;
    protected transient URLConnection mConnection;
    protected static ConnectionManager mConnectionManager = new ConnectionManager();
	public static String getQICHAODEFAULT_CHARSET() {
		return QICHAODEFAULT_CHARSET;
	}

}
 

在调用的时候,代码如下:

			Parser parser = new Parser(url);
			parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET());

  一般情况下,设置成这样应该是没问题的啦,但是,你有时候看到的编码方式并不一定是它该网页的编码方式。比如说,肉眼看到页面中有<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />,htmlparser可以正常的获取到,代码如下:

    public void doSemanticAction()
        throws ParserException
    {
        String httpEquiv = getHttpEquiv();
        if("Content-Type".equalsIgnoreCase(httpEquiv))
        {
            String charset = getPage().getCharset(getAttribute("CONTENT"));
            getPage().setEncoding(charset);
        }
    }

 但是,你不要认为这个就是它的编码方式啦,在htmlparser,还进行了一次判断,在类Page中,有个方法是获取报头字段Content-Type的。代码如下:

    public String getContentType()
    {
        String ret = "text/html";
        URLConnection connection = getConnection();
        if(null != connection)
        {
            String content = connection.getHeaderField("Content-Type");
            if(null != content)
                ret = content;
        }
        return ret;
    }

 两个进行比较,如果不一样的话,它就报了

org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old:  [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23

 ,所以,我在类InputStreamSource再一次进行修改:

代码如下:

    public void setEncoding(String character_set)
        throws ParserException
    {
        String encoding = getEncoding();
        /**
         * time:2008年12月23日
         */
        if(encoding!=null){
        	character_set = encoding;
        }
        if(!encoding.equalsIgnoreCase(character_set))
        {
            InputStream stream = getStream();
            try
            {
                char buffer[] = mBuffer;
                int offset = mOffset;
                stream.reset();
                try
                {
                    mEncoding = character_set;
                    mReader = new InputStreamReader(stream, character_set);
                    mBuffer = new char[mBuffer.length];
                    mLevel = 0;
                    mOffset = 0;
                    mMark = -1;
                    if(0 != offset)
                    {
                        char new_chars[] = new char[offset];
                        if(offset != read(new_chars))
                            throw new ParserException("reset stream failed");
                        for(int i = 0; i < offset; i++)
                            if(new_chars[i] != buffer[i])
                                throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString(buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);

                    }
                }
                catch(IOException ioe)
                {
                    throw new ParserException(ioe.getMessage(), ioe);
                }
            }
            catch(IOException ioe)
            {
                throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
            }
        }
    }

 这样应该来说,不管什么方式都OK的。

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
org.htmlparser.Tag org.htmlparser.Node org.htmlparser.Text org.htmlparser.Parser org.htmlparser.Remark org.htmlparser.tags.Div org.htmlparser.Attribute org.htmlparser.tags.Html org.htmlparser.tags.Span org.htmlparser.NodeFilter org.htmlparser.lexer.Page org.htmlparser.NodeFactory org.htmlparser.http.Cookie org.htmlparser.lexer.Lexer org.htmlparser.sax.Locator org.htmlparser.tags.Bullet org.htmlparser.tags.JspTag org.htmlparser.lexer.Cursor org.htmlparser.lexer.Source org.htmlparser.lexer.Stream org.htmlparser.sax.Feedback org.htmlparser.tags.BodyTag org.htmlparser.tags.FormTag org.htmlparser.tags.HeadTag org.htmlparser.tags.LinkTag org.htmlparser.tags.MetaTag org.htmlparser.nodes.TagNode org.htmlparser.sax.XMLReader org.htmlparser.tags.FrameTag org.htmlparser.tags.ImageTag org.htmlparser.tags.InputTag org.htmlparser.tags.LabelTag org.htmlparser.tags.StyleTag org.htmlparser.tags.TableRow org.htmlparser.tags.TableTag org.htmlparser.tags.TitleTag org.htmlparser.util.NodeList org.htmlparser.beans.LinkBean org.htmlparser.nodes.TextNode org.htmlparser.sax.Attributes org.htmlparser.tags.AppletTag org.htmlparser.tags.ObjectTag org.htmlparser.tags.OptionTag org.htmlparser.tags.ScriptTag org.htmlparser.tags.SelectTag org.htmlparser.util.Translate org.htmlparser.util.sort.Sort org.htmlparser.beans.BeanyBaby org.htmlparser.http.HttpHeader org.htmlparser.lexer.PageIndex org.htmlparser.tags.BulletList org.htmlparser.tags.DoctypeTag org.htmlparser.tags.HeadingTag org.htmlparser.util.NodeList$1 org.htmlparser.beans.FilterBean org.htmlparser.beans.StringBean org.htmlparser.filters.OrFilter org.htmlparser.nodes.RemarkNode org.htmlparser.scanners.Scanner org.htmlparser.tags.BaseHrefTag org.htmlparser.tags.FrameSetTag org.htmlparser.tags.TableColumn org.htmlparser.tags.TableHeader org.htmlparser.tags.TextareaTag org.htmlparser.util.ParserUtils org.htmlparser.beans.BeanyBaby$1 org.htmlparser.filters.AndFilter org.htmlparser.filters.NotFilter org.htmlparser.filters.XorFilter org.htmlparser.tags.CompositeTag org.htmlparser.tags.ParagraphTag org.htmlparser.util.IteratorImpl org.htmlparser.util.NodeIterator org.htmlparser.visitors.HtmlPage org.htmlparser.util.sort.Ordered org.htmlparser.beans.HTMLLinkBean org.htmlparser.beans.HTMLTextBean org.htmlparser.lexer.StringSource org.htmlparser.nodes.AbstractNode org.htmlparser.util.sort.Sortable org.htmlparser.filters.RegexFilter org.htmlparser.lexer.PageAttribute org.htmlparser.scanners.JspScanner org.htmlparser.scanners.TagScanner org.htmlparser.tags.DefinitionList org.htmlparser.util.NodeTreeWalker org.htmlparser.util.ParserFeedback org.htmlparser.filters.StringFilter org.htmlparser.util.FeedbackManager org.htmlparser.util.ParserException org.htmlparser.visitors.NodeVisitor org.htmlparser.filters.IsEqualFilter org.htmlparser.filters.TagNameFilter org.htmlparser.scanners.StyleScanner org.htmlparser.util.ChainedException org.htmlparser.filters.HasChildFilter org.htmlparser.http.ConnectionManager org.htmlparser.http.ConnectionMonitor org.htmlparser.scanners.ScriptDecoder org.htmlparser.scanners.ScriptScanner org.htmlparser.PrototypicalNodeFactory org.htmlparser.filters.HasParentFilter org.htmlparser.filters.LinkRegexFilter org.htmlparser.filters.NodeClassFilter org.htmlparser.lexer.InputStreamSource org.htmlparser.util.CharacterReference org.htmlparser.util.SimpleNodeIterator org.htmlparser.filters.HasSiblingFilter org.htmlparser.filters.LinkStringFilter org.htmlparser.tags.DefinitionListBullet org.htmlparser.util.CharacterReferenceEx org.htmlparser.filters.HasAttributeFilter org.htmlparser.util.DefaultParserFeedback org.htmlparser.visitors.TagFindingVisitor org.htmlparser.visitors.LinkFindingVisitor org.htmlparser.scanners.CompositeTagScanner org.htmlparser.util.EncodingChangeException org.htmlparser.visitors.UrlModifyingVisitor org.htmlparser.filters.CssSelectorNodeFilter org.htmlparser.tags.ProcessingInstructionTag org.htmlparser.visitors.ObjectFindingVisitor org.htmlparser.visitors.StringFindingVisitor org.htmlparser.visitors.TextExtractingVisitor org.htmlparser.filters.CssSelectorNodeFilter$1 org.htmlparser.parserapplications.SiteCapturer org.htmlparser.parserapplications.WikiCapturer org.htmlparser.parserapplications.LinkExtractor org.htmlparser.parserapplications.LinkExtractor$1 org.htmlparser.parserapplications.StringExtractor org.htmlparser.filters.CssSelectorNodeFilter$YesFilter org.htmlparser.parserapplications.filterbuilder.Filter org.htmlparser.filters.CssSelectorNodeFilter$AdjacentFilter org.htmlparser.parserapplications.SiteCapturer$LocalLinkTag org.htmlparser.parserapplications.SiteCapturer$LocalFrameTag org.htmlparser.parserapplications.SiteCapturer$LocalImageTag org.htmlparser.parserapplications.filterbuilder.FilterBuilder org.htmlparser.parserapplications.filterbuilder.HtmlTreeModel org.htmlparser.parserapplications.filterbuilder.SubFilterList org.htmlparser.filters.CssSelectorNodeFilter$AttribMatchFilter org.htmlparser.filters.CssSelectorNodeFilter$HasAncestorFilter org.htmlparser.parserapplications.SiteCapturer$LocalBaseHrefTag org.htmlparser.parserapplications.filterbuilder.HtmlTreeCellRenderer org.htmlparser.parserapplications.filterbuilder.wrappers.OrFilterWrapper org.htmlparser.parserapplications.filterbuilder.layouts.NullLayoutManager org.htmlparser.parserapplications.filterbuilder.wrappers.AndFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.NotFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.RegexFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.StringFilterWrapper org.htmlparser.parserapplications.filterbuilder.layouts.VerticalLayoutManager org.htmlparser.parserapplications.filterbuilder.wrappers.TagNameFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasChildFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasParentFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.NodeClassFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasSiblingFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasAttributeFilterWrapper
按DOM模型解析html文件的工具包 已下是源码列表: META-INF/MANIFEST.MF META-INF/maven/org.htmlparser/htmlparser/pom.properties META-INF/maven/org.htmlparser/htmlparser/pom.xml org.htmlparser.Parser.class org.htmlparser.PrototypicalNodeFactory.class org.htmlparser.beans.BeanyBaby.class org.htmlparser.beans.FilterBean.class org.htmlparser.beans.HTMLLinkBean.class org.htmlparser.beans.HTMLTextBean.class org.htmlparser.beans.LinkBean.class org.htmlparser.beans.StringBean.class org.htmlparser.filters.AndFilter.class org.htmlparser.filters.CssSelectorNodeFilter.class org.htmlparser.filters.HasAttributeFilter.class org.htmlparser.filters.HasChildFilter.class org.htmlparser.filters.HasParentFilter.class org.htmlparser.filters.HasSiblingFilter.class org.htmlparser.filters.IsEqualFilter.class org.htmlparser.filters.LinkRegexFilter.class org.htmlparser.filters.LinkStringFilter.class org.htmlparser.filters.NodeClassFilter.class org.htmlparser.filters.NotFilter.class org.htmlparser.filters.OrFilter.class org.htmlparser.filters.RegexFilter.class org.htmlparser.filters.StringFilter.class org.htmlparser.filters.TagNameFilter.class org.htmlparser.http.HttpHeader.class org.htmlparser.sax.Attributes.class org.htmlparser.sax.Feedback.class org.htmlparser.sax.Locator.class org.htmlparser.sax.XMLReader.class org.htmlparser.scanners.CompositeTagScanner.class org.htmlparser.scanners.JspScanner.class org.htmlparser.scanners.ScriptDecoder.class org.htmlparser.scanners.ScriptScanner.class org.htmlparser.scanners.StyleScanner.class org.htmlparser.tags.AppletTag.class org.htmlparser.tags.BaseHrefTag.class org.htmlparser.tags.BlockquoteTag.class org.htmlparser.tags.BodyTag.class org.htmlparser.tags.Bullet.class org.htmlparser.tags.BulletList.class org.htmlparser.tags.CompositeTag.class org.htmlparser.tags.DefinitionList.class org.htmlparser.tags.DefinitionListBullet.class org.htmlparser.tags.Div.class org.htmlparser.tags.DoctypeTag.class org.htmlparser.tags.FormTag.class org.htmlparser.tags.FrameSetTag.class org.htmlparser.tags.FrameTag.class org.htmlparser.tags.HeadTag.class org.htmlparser.tags.HeadingTag.class org.htmlparser.tags.Html.class org.htmlparser.tags.ImageTag.class org.htmlparser.tags.InputTag.class org.htmlparser.tags.JspTag.class org.htmlparser.tags.LabelTag.class org.htmlparser.tags.LinkTag.class org.htmlparser.tags.MetaTag.class org.htmlparser.tags.ObjectTag.class org.htmlparser.tags.OptionTag.class org.htmlparser.tags.ParagraphTag.class org.htmlparser.tags.ProcessingInstructionTag.class org.htmlparser.tags.ScriptTag.class org.htmlparser.tags.SelectTag.class org.htmlparser.tags.Span.class org.htmlparser.tags.StyleTag.class org.htmlparser.tags.TableColumn.class org.htmlparser.tags.TableHeader.class org.htmlparser.tags.TableRow.class org.htmlparser.tags.TableTag.class org.htmlparser.tags.TextareaTag.class org.htmlparser.tags.TitleTag.class org.htmlparser.util.CharacterReference.class org.htmlparser.util.CharacterReferenceEx.class org.htmlparser.util.DefaultParserFeedback.class org.htmlparser.util.FeedbackManager.class org.htmlparser.util.IteratorImpl.class org.htmlparser.util.NodeTreeWalker.class org.htmlparser.util.ParserFeedback.class org.htmlparser.util.ParserUtils.class org.htmlparser.util.Translate.class org.htmlparser.visitors.HtmlPage.class org.htmlparser.visitors.LinkFindingVisitor.class org.htmlparser.visitors.ObjectFindingVisitor.class org.htmlparser.visitors.StringFindingVisitor.class org.htmlparser.visitors.TagFindingVisitor.class org.htmlparser.visitors.TextExtractingVisitor.class org.htmlparser.visitors.UrlModifyingVisitor.class org/htmlparser/beans/images/Chain16.gif org/htmlparser/beans/images/Chain32.gif org/htmlparser/beans/images/Knot16.gif org/htmlparser/beans/images/Knot32.gif
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值