htmlparser的编码问题

最新推荐文章于 2021-06-03 03:38:03 发布

会飞的五花肉

最新推荐文章于 2021-06-03 03:38:03 发布

阅读量411

点赞数

分类专栏： htmlparser-spider 文章标签： UP

本文链接：https://blog.csdn.net/gb5332360/article/details/83681447

版权

htmlparser-spider 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

htmlparser在提取网站内容时，有时会出现乱码或者是编码不能转换的问题。这是htmlparser的一个小bug，因为htmlparser作为一个开源软件已经很长时间没有更新了。

org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23或者会出现页面的乱码问题。
为了彻底避免上述问题，我们可以改下htmlparser的源码的两个类。
package org.htmlparser.lexer;和InputStreamSource类。另外我们还要用 CodepageDetectorProxy（根据二进制流来分析网页编码）来提前解析网页编码。
htmlparser中设置编码一般为
Parser parser=new Parser(url);
parser.setEnconding("bianma");
但存在漏洞。
htmlparser编码的分析过程：htmlparser会根据服务器返回的文件头信息与网页的meta标签中的编码进行对比，如果服务器返回的文件头编码为空，默认返回为ISO-8859-1的编码，它会meta标签的charset里的编码对比。

改进的思路：利用CodepageDetectorProxy.jar对网页进行编码分析，获得网页的编码格式。将htmlparser的服务器返回的默认编码设置为CodepageDetectorProxy解析的编码。这样的编码和meta标签的编码总能保持一致了。。代码如下：
整个修改过程如下：


import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.ParsingDetector;
import java.net.MalformedURLException;
import java.net.URL;
import org.htmlparser.lexer.Page;

public class WebEncoding {

	public String AnalyEnconding(String path){
	URL url=null;
	try {
		url=new URL(path);
	} catch (MalformedURLException e) {
		e.printStackTrace();
	}
	CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance(); 
	detector.add(new ParsingDetector(false)); 
	java.nio.charset.Charset charset = null;  
	try {  
	      charset = detector.detectCodepage(url);  
	} catch (Exception ex) {ex.printStackTrace();}
	 if(charset.name().equalsIgnoreCase("utf-8")||charset.name().equals("UTF-8")){
		 Page.GaoBinDEFAULT_CHARSET="utf-8";
      }else{
    	  Page.GaoBinDEFAULT_CHARSET="gb2312";
     }
	   return Page.getGaoBinDEFAULT_CHARSET();
	}
}


package org.htmlparser.lexer;
import java.io.*;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.*;
import java.util.zip.*;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.util.ParserException;

// Referenced classes of package org.htmlparser.lexer:
//            InputStreamSource, PageIndex, StringSource, Cursor, 
//            Stream, Source

public class Page
    implements Serializable
{

    public Page()
    {
        this("");
    }

    public Page(URLConnection connection)
        throws ParserException
    {
        if(null == connection)
        {
            throw new IllegalArgumentException("connection cannot be null");
        } else
        {
            setConnection(connection);
            mBaseUrl = null;
            return;
        }
    }

    public Page(InputStream stream, String charset)
        throws UnsupportedEncodingException
    {
        if(null == stream)
            throw new IllegalArgumentException("stream cannot be null");
        if(null == charset)
            charset = "ISO-8859-1";
        mSource = new InputStreamSource(stream, charset);
        mIndex = new PageIndex(this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;
    }

    public Page(String text, String charset)
    {
        if(null == text)
            throw new IllegalArgumentException("text cannot be null");
        if(null == charset)
            charset = "ISO-8859-1";
        mSource = new StringSource(text, charset);
        mIndex = new PageIndex(this);
        mConnection = null;
        mUrl = null;
        mBaseUrl = null;
    }

    public Page(String text)
    {
        this(text, null);
    }

    public Page(Source source)
    {
        if(null == source)
        {
            throw new IllegalArgumentException("source cannot be null");
        } else
        {
            mSource = source;
            mIndex = new PageIndex(this);
            mConnection = null;
            mUrl = null;
            mBaseUrl = null;
            return;
        }
    }

    public static ConnectionManager getConnectionManager()
    {
        return mConnectionManager;
    }

    public static void setConnectionManager(ConnectionManager manager)
    {
        mConnectionManager = manager;
    }

    public String getCharset(String content)
    {
        String CHARSET_STRING = "charset";
        String ret;
        if(null == mSource)
            ret = "ISO-8859-1";
        else
            ret = mSource.getEncoding();
        if(null != content)
        {
            int index = content.indexOf("charset");
            if(index != -1)
            {
                content = content.substring(index + "charset".length()).trim();
                if(content.startsWith("="))
                {
                    content = content.substring(1).trim();
                    index = content.indexOf(";");
                    if(index != -1)
                        content = content.substring(0, index);
                    if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())
                        content = content.substring(1, content.length() - 1);
                    if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())
                        content = content.substring(1, content.length() - 1);
                    ret = findCharset(content, ret);
                }
            }
        }
        return ret;
    }

    public static String findCharset(String name, String fallback)
    {
        String ret;
        try
        {
            Class cls = Class.forName("java.nio.charset.Charset");
            Method method = cls.getMethod("forName", new Class[] {
                java.lang.String.class
            });
            Object object = method.invoke(null, new Object[] {
                name
            });
            method = cls.getMethod("name", new Class[0]);
            object = method.invoke(object, new Object[0]);
            ret = (String)object;
        }
        catch(ClassNotFoundException cnfe)
        {
            ret = name;
        }
        catch(NoSuchMethodException nsme)
        {
            ret = name;
        }
        catch(IllegalAccessException ia)
        {
            ret = name;
        }
        catch(InvocationTargetException ita)
        {
            ret = fallback;
            System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
        }
        return ret;
    }

    private void writeObject(ObjectOutputStream out)
        throws IOException
    {
        if(null != getConnection())
        {
            out.writeBoolean(true);
            out.writeInt(mSource.offset());
            String href = getUrl();
            out.writeObject(href);
            setUrl(getConnection().getURL().toExternalForm());
            Source source = getSource();
            mSource = null;
            PageIndex index = mIndex;
            mIndex = null;
            out.defaultWriteObject();
            mSource = source;
            mIndex = index;
        } else
        {
            out.writeBoolean(false);
            String href = getUrl();
            out.writeObject(href);
            setUrl(null);
            out.defaultWriteObject();
            setUrl(href);
        }
    }

    private void readObject(ObjectInputStream in)
        throws IOException, ClassNotFoundException
    {
        boolean fromurl = in.readBoolean();
        if(fromurl)
        {
            int offset = in.readInt();
            String href = (String)in.readObject();
            in.defaultReadObject();
            if(null != getUrl())
            {
                URL url = new URL(getUrl());
                try
                {
                    setConnection(url.openConnection());
                }
                catch(ParserException pe)
                {
                    throw new IOException(pe.getMessage());
                }
            }
            Cursor cursor = new Cursor(this, 0);
            for(int i = 0; i < offset; i++)
                try
                {
                    getCharacter(cursor);
                }
                catch(ParserException pe)
                {
                    throw new IOException(pe.getMessage());
                }

            setUrl(href);
        } else
        {
            String href = (String)in.readObject();
            in.defaultReadObject();
            setUrl(href);
        }
    }

    public void reset()
    {
        getSource().reset();
        mIndex = new PageIndex(this);
    }

    public void close()
        throws IOException
    {
        if(null != getSource())
            getSource().destroy();
    }

    protected void finalize()
        throws Throwable
    {
        close();
    }

    public URLConnection getConnection()
    {
        return mConnection;
    }

    public void setConnection(URLConnection connection)
        throws ParserException
    {
        mConnection = connection;
        mConnection.setConnectTimeout(6000);
        mConnection.setReadTimeout(6000);
        try
        {
            getConnection().connect();
        }
        catch(UnknownHostException uhe)
        {
            throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
        }
        catch(IOException ioe)
        {
            throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
        }
        String type = getContentType();
        String charset = getCharset(type);
        try
        {
            String contentEncoding = connection.getContentEncoding();
            System.out.println("contentEncoding="+contentEncoding);
            Stream stream;
            if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))
                stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
            else
            if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))
                stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
            else{
                stream = new Stream(getConnection().getInputStream());
            }

            try
            {
		        /*
            	 * 时间:2010年8月6日
            	 * 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下
            	 */
            	if(charset.indexOf("ISO-8859-1")!=-1){

            		charset =getGaoBinDEFAULT_CHARSET() ;

            	}
		mSource = new InputStreamSource(stream, charset);
            }
            catch(UnsupportedEncodingException uee)
            {
                charset = "ISO-8859-1";
                mSource = new InputStreamSource(stream, charset);
            }
        }
        catch(IOException ioe)
        {
            throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
        }
        mUrl = connection.getURL().toExternalForm();
		mIndex = new PageIndex(this);
    }

    public String getUrl()
    {
        return mUrl;
    }

    public void setUrl(String url)
    {
        mUrl = url;
    }

    public String getBaseUrl()
    {
        return mBaseUrl;
    }

    public void setBaseUrl(String url)
    {
        mBaseUrl = url;
    }

    public Source getSource()
    {
        return mSource;
    }

    public String getContentType()
    {
        String ret = "text/html";
        URLConnection connection = getConnection();
        if(null != connection)
        {
            String content = connection.getHeaderField("Content-Type");
            if(null != content)
                ret = content;
        }
        return ret;
    }

    public char getCharacter(Cursor cursor)
        throws ParserException
    {
        int i = cursor.getPosition();
        int offset = mSource.offset();
        char ret;
        if(offset == i)
            try
            {
                i = mSource.read();
                if(-1 == i)
                {
                    ret = '\uFFFF';
                } else
                {
                    ret = (char)i;
                    cursor.advance();
                }
            }
            catch(IOException ioe)
            {
                throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
            }
        else
        if(offset > i)
        {
            try
            {
                ret = mSource.getCharacter(i);
            }
            catch(IOException ioe)
            {
                throw new ParserException("can't read a character at position " + i, ioe);
            }
            cursor.advance();
        } else
        {
            throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
        }
        if('\r' == ret)
        {
            ret = '\n';
            if(mSource.offset() == cursor.getPosition())
                try
                {
                    i = mSource.read();
                    if(-1 != i)
                        if('\n' == (char)i)
                            cursor.advance();
                        else
                            try
                            {
                                mSource.unread();
                            }
                            catch(IOException ioe)
                            {
                                throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
                            }
                }
                catch(IOException ioe)
                {
                    throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
                }
            else
                try
                {
                    if('\n' == mSource.getCharacter(cursor.getPosition()))
                        cursor.advance();
                }
                catch(IOException ioe)
                {
                    throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
                }
        }
        if('\n' == ret)
            mIndex.add(cursor);
        return ret;
    }

    public void ungetCharacter(Cursor cursor)
        throws ParserException
    {
        cursor.retreat();
        int i = cursor.getPosition();
        try
        {
            char ch = mSource.getCharacter(i);
            if('\n' == ch && 0 != i)
            {
                ch = mSource.getCharacter(i - 1);
                if('\r' == ch)
                    cursor.retreat();
            }
        }
        catch(IOException ioe)
        {
            throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
        }
    }

    public String getEncoding()
    {
        return getSource().getEncoding();
    }

    public void setEncoding(String character_set)
        throws ParserException
    {
    	Page.GaoBinDEFAULT_CHARSET = character_set;
        getSource().setEncoding(character_set);
    }

    public URL constructUrl(String link, String base)
        throws MalformedURLException
    {
        return constructUrl(link, base, false);
    }

    public URL constructUrl(String link, String base, boolean strict)
        throws MalformedURLException
    {
        int index;
        URL url;
        if(!strict && '?' == link.charAt(0))
        {
            if(-1 != (index = base.lastIndexOf('?')))
                base = base.substring(0, index);
            url = new URL(base + link);
        } else
        {
            url = new URL(new URL(base), link);
        }
        String path = url.getFile();
        boolean modified = false;
        boolean absolute = link.startsWith("/");
        if(!absolute)
            do
            {
                if(!path.startsWith("/."))
                    break;
                if(path.startsWith("/../"))
                {
                    path = path.substring(3);
                    modified = true;
                    continue;
                }
                if(!path.startsWith("/./") && !path.startsWith("/."))
                    break;
                path = path.substring(2);
                modified = true;
            } while(true);
        while(-1 != (index = path.indexOf("/\\"))) 
        {
            path = path.substring(0, index + 1) + path.substring(index + 2);
            modified = true;
        }
        if(modified)
            url = new URL(url, path);
        return url;
    }

    public String getAbsoluteURL(String link)
    {
        return getAbsoluteURL(link, false);
    }

    public String getAbsoluteURL(String link, boolean strict)
    {
        String ret;
        if(null == link || "".equals(link))
            ret = "";
        else
            try
            {
                String base = getBaseUrl();
                if(null == base)
                    base = getUrl();
                if(null == base)
                {
                    ret = link;
                } else
                {
                    URL url = constructUrl(link, base, strict);
                    ret = url.toExternalForm();
                }
            }
            catch(MalformedURLException murle)
            {
                ret = link;
            }
        return ret;
    }

    public int row(Cursor cursor)
    {
        return mIndex.row(cursor);
    }

    public int row(int position)
    {
        return mIndex.row(position);
    }

    public int column(Cursor cursor)
    {
        return mIndex.column(cursor);
    }

    public int column(int position)
    {
        return mIndex.column(position);
    }

    public String getText(int start, int end)
        throws IllegalArgumentException
    {
        String ret;
        try
        {
            ret = mSource.getString(start, end - start);
        }
        catch(IOException ioe)
        {
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
        }
        return ret;
    }

    public void getText(StringBuffer buffer, int start, int end)
        throws IllegalArgumentException
    {
        if(mSource.offset() < start || mSource.offset() < end)
            throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
        int length;
        if(end < start)
        {
            length = end;
            end = start;
            start = length;
        }
        length = end - start;
        try
        {
            mSource.getCharacters(buffer, start, length);
        }
        catch(IOException ioe)
        {
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
        }
    }

    public String getText()
    {
        return getText(0, mSource.offset());
    }

    public void getText(StringBuffer buffer)
    {
        getText(buffer, 0, mSource.offset());
    }

    public void getText(char array[], int offset, int start, int end)
        throws IllegalArgumentException
    {
        if(mSource.offset() < start || mSource.offset() < end)
            throw new IllegalArgumentException("attempt to extract future characters from source");
        int length;
        if(end < start)
        {
            length = end;
            end = start;
            start = length;
        }
        length = end - start;
        try
        {
            mSource.getCharacters(array, offset, start, end);
        }
        catch(IOException ioe)
        {
            throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
        }
    }

    public String getLine(Cursor cursor)
    {
        int line = row(cursor);
        int size = mIndex.size();
        int start;
        int end;
        if(line < size)
        {
            start = mIndex.elementAt(line);
            if(++line <= size)
                end = mIndex.elementAt(line);
            else
                end = mSource.offset();
        } else
        {
            start = mIndex.elementAt(line - 1);
            end = mSource.offset();
        }
        return getText(start, end);
    }

    public String getLine(int position)
    {
        return getLine(new Cursor(this, position));
    }

    public String toString()
    {
        String ret;
        if(mSource.offset() > 0)
        {
            StringBuffer buffer = new StringBuffer(43);
            int start = mSource.offset() - 40;
            if(0 > start)
                start = 0;
            else
                buffer.append("...");
            getText(buffer, start, mSource.offset());
            ret = buffer.toString();
        } else
        {
            ret = super.toString();
        }
        return ret;
    }

    public static final String DEFAULT_CHARSET = "ISO-8859-1";
    public static String GaoBinDEFAULT_CHARSET;
	public static final String DEFAULT_CONTENT_TYPE = "text/html";
    public static final char EOF = 65535;
    protected String mUrl;
    protected String mBaseUrl;
    protected Source mSource;
    protected PageIndex mIndex;
    protected transient URLConnection mConnection;
    protected static ConnectionManager mConnectionManager = new ConnectionManager();

    public static String getGaoBinDEFAULT_CHARSET() {
		return GaoBinDEFAULT_CHARSET;
	}

	public static void setGaoBinDEFAULT_CHARSET(String gaoBinDEFAULT_CHARSET) {
		GaoBinDEFAULT_CHARSET = gaoBinDEFAULT_CHARSET;
	}

}


package org.htmlparser.lexer;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.UnsupportedEncodingException;

import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.ParserException;

public class InputStreamSource
    extends
        Source
{
    /**
     * An initial buffer size.
     * Has a default value of {16384}.
     */
    public static int BUFFER_SIZE = 16384;

    /**
     * The stream of bytes.
     * Set to <code>null</code> when the source is closed.
     */
    protected transient InputStream mStream;

    /**
     * The character set in use.
     */
    protected String mEncoding;

    /**
     * The converter from bytes to characters.
     */
    protected transient InputStreamReader mReader;

    /**
     * The characters read so far.
     */
    protected char[] mBuffer;

    /**
     * The number of valid bytes in the buffer.
     */
    protected int mLevel;

    /**
     * The offset of the next byte returned by read().
     */
    protected int mOffset;
    /**
     * The bookmark.
     */
    protected int mMark;

    /**
     * Create a source of characters using the default character set.
     * @param stream The stream of bytes to use.
     * @exception UnsupportedEncodingException If the default character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream)
        throws
            UnsupportedEncodingException
    {
        this (stream, null, BUFFER_SIZE);
    }

    public InputStreamSource (InputStream stream, String charset)
        throws
            UnsupportedEncodingException
    {
        this (stream, charset, BUFFER_SIZE);
    }

    /**
     * Create a source of characters.
     * @param stream The stream of bytes to use.
     * @param charset The character set used in encoding the stream.
     * @param size The initial character buffer size.
     * @exception UnsupportedEncodingException If the character set
     * is unsupported.
     */
    public InputStreamSource (InputStream stream, String charset, int size)
        throws
            UnsupportedEncodingException
    {
        if (null == stream)
            stream = new Stream (null);
        else
            // bug #1044707 mark()/reset() issues
            if (!stream.markSupported ())
                // wrap the stream so we can reset
                stream = new Stream (stream);
        mStream = stream;
        if (null == charset)
        {
            mReader = new InputStreamReader (stream);
            mEncoding = mReader.getEncoding ();
        }
        else
        {
            mEncoding = charset;
            mReader = new InputStreamReader (stream, charset);
        }
        mBuffer = new char[size];
        mLevel = 0;
        mOffset = 0;
        mMark = -1;
    }
  /**
     * Serialization support.
     * @param out Where to write this object.
     * @exception IOException If serialization has a problem.
     */
    private void writeObject (ObjectOutputStream out)
        throws
            IOException
    {
        int offset;
        char[] buffer;

        if (null != mStream)
        {
            // remember the offset, drain the input stream, restore the offset
            offset = mOffset;
            buffer = new char[4096];
            while (EOF != read (buffer))
                ;
            mOffset = offset;
        }

        out.defaultWriteObject ();
    }

    /**
     * Deserialization support.
     * @param in Where to read this object from.
     * @exception IOException If deserialization has a problem.
     */
    private void readObject (ObjectInputStream in)
        throws
            IOException,
            ClassNotFoundException
    {
        in.defaultReadObject ();
        if (null != mBuffer) // buffer is null when destroy's been called
            // pretend we're open, mStream goes null when exhausted
            mStream = new ByteArrayInputStream (new byte[0]);
    }

    /**
     * Get the input stream being used.
     * @return The current input stream.
     */
    public InputStream getStream ()
    {
        return (mStream);
    }

    /**
     * Get the encoding being used to convert characters.
     * @return The current encoding.
     */
    public String getEncoding ()
    {
        return (mEncoding);
    }

    /**
     * Begins reading from the source with the given character set.
     * If the current encoding is the same as the requested encoding,
     * this method is a no-op. Otherwise any subsequent characters read from
     * this page will have been decoded using the given character set.<p>
     * Some magic happens here to obtain this result if characters have already
     * been consumed from this source.
     * Since a Reader cannot be dynamically altered to use a different character
     * set, the underlying stream is reset, a new Source is constructed
     * and a comparison made of the characters read so far with the newly
     * read characters up to the current position.
     * If a difference is encountered, or some other problem occurs,
     * an exception is thrown.
     * @param character_set The character set to use to convert bytes into
     * characters.
     * @exception ParserException If a character mismatch occurs between
     * characters already provided and those that would have been returned
     * had the new character set been in effect from the beginning. An
     * exception is also thrown if the underlying stream won't put up with
     * these shenanigans.
     */
    public void setEncoding (String character_set)
        throws
            ParserException
    {   
        String encoding;
        InputStream stream;
        char[] buffer;
        int offset;
        char[] new_chars;

        encoding = getEncoding ();
        if(encoding!=null){
        	character_set=encoding;
        }
        if (!encoding.equalsIgnoreCase (character_set))
        {
            stream = getStream ();
            try
            {
                buffer = mBuffer;
                offset = mOffset;
                stream.reset ();
                try
                {
                    mEncoding = character_set;
                    mReader = new InputStreamReader (stream, character_set);
                    mBuffer = new char[mBuffer.length];
                    mLevel = 0;
                    mOffset = 0;
                    mMark = -1;
                    if (0 != offset)
                    {
                        new_chars = new char[offset];
                        if (offset != read (new_chars))
                            throw new ParserException ("reset stream failed");
                        for (int i = 0; i < offset; i++)
                            if (new_chars[i] != buffer[i])
                                throw new EncodingChangeException ("character mismatch (new: "
                                + new_chars[i]
                                + " [0x"
                                + Integer.toString (new_chars[i], 16)
                                + "] != old: "
                                + " [0x"
                                + Integer.toString (buffer[i], 16)
                                + buffer[i]
                                + "]) for encoding change from "
                                + encoding
                                + " to "
                                + character_set
                                + " at character offset "
                                + i);
                    }
                }
                catch (IOException ioe)
                {
                    throw new ParserException (ioe.getMessage (), ioe);
                }
            }
            catch (IOException ioe)
            {   // bug #1044707 mark()/reset() issues
                throw new ParserException ("Stream reset failed ("
                    + ioe.getMessage ()
                    + "), try wrapping it with a org.htmlparser.lexer.Stream",
                    ioe);
            }
        }
    }

    /**
     * Fetch more characters from the underlying reader.
     * Has no effect if the underlying reader has been drained.
     * @param min The minimum to read.
     * @exception IOException If the underlying reader read() throws one.
     */
    protected void fill (int min)
        throws
            IOException
    {
        char[] buffer;
        int size;
        int read;

        if (null != mReader) // mReader goes null when it's been sucked dry
        {
            size = mBuffer.length - mLevel; // available space
            if (size < min) // oops, better get some buffer space
            {
                // unknown length... keep doubling
                size = mBuffer.length * 2;
                read = mLevel + min;
                if (size < read) // or satisfy min, whichever is greater
                    size = read;
                else
                    min = size - mLevel; // read the max
                buffer = new char[size];
            }
            else
            {
                buffer = mBuffer;
                min = size;
            }

            // read into the end of the 'new' buffer
            read = mReader.read (buffer, mLevel, min);
            if (EOF == read)
            {
                mReader.close ();
                mReader = null;
            }
            else
            {
                if (mBuffer != buffer)
                {   // copy the bytes previously read
                    System.arraycopy (mBuffer, 0, buffer, 0, mLevel);
                    mBuffer = buffer;
                }
                mLevel += read;
            }
            // todo, should repeat on read shorter than original min
        }
    }
    /**
     * Does nothing.
     * It's supposed to close the source, but use destroy() instead.
     * @exception IOException <em>not used</em>
     * @see #destroy
     */
    public void close () throws IOException
    {
    }

    /**
     * Read a single character.
     * This method will block until a character is available,
     * an I/O error occurs, or the end of the stream is reached.
     * @return The character read, as an integer in the range 0 to 65535
     * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has
     * been reached
     * @exception IOException If an I/O error occurs.
     */
    public int read () throws IOException
    {
        int ret;

        if (mLevel - mOffset < 1)
        {
            if (null == mStream)
                throw new IOException ("source is closed");
            fill (1);
            if (mOffset >= mLevel)
                ret = EOF;
            else
                ret = mBuffer[mOffset++];
        }
        else
            ret = mBuffer[mOffset++];

        return (ret);
    }

    /**
     * Read characters into a portion of an array.  This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     * @param cbuf Destination buffer
     * @param off Offset at which to start storing characters
     * @param len Maximum number of characters to read
     * @return The number of characters read, or {@link #EOF EOF} if the end of
     * the stream has been reached
     * @exception IOException If an I/O error occurs.
     */
    public int read (char[] cbuf, int off, int len) throws IOException
    {
        int ret;

        if (null == mStream)
            throw new IOException ("source is closed");
        if ((null == cbuf) || (0 > off) || (0 > len))
            throw new IOException ("illegal argument read ("
                + ((null == cbuf) ? "null" : "cbuf")
                + ", " + off + ", " + len + ")");
        if (mLevel - mOffset < len)
            fill (len - (mLevel - mOffset)); // minimum to satisfy this request
        if (mOffset >= mLevel)
            ret = EOF;
        else
        {
            ret = Math.min (mLevel - mOffset, len);
            System.arraycopy (mBuffer, mOffset, cbuf, off, ret);
            mOffset += ret;
        }

        return (ret);
    }

    /**
     * Read characters into an array.
     * This method will block until some input is available, an I/O error occurs,
     * or the end of the stream is reached.
     * @param cbuf Destination buffer.
     * @return The number of characters read, or {@link #EOF EOF} if the end of
     * the stream has been reached.
     * @exception IOException If an I/O error occurs.
     */
    public int read (char[] cbuf) throws IOException
    {
        return (read (cbuf, 0, cbuf.length));
    }

    /**
     * Reset the source.
     * Repositions the read point to begin at zero.
     * @exception IllegalStateException If the source has been closed.
     */
    public void reset ()
        throws
            IllegalStateException
    {
        if (null == mStream)
            throw new IllegalStateException ("source is closed");
        if (-1 != mMark)
            mOffset = mMark;
        else
            mOffset = 0;
    }

    /**
     * Tell whether this source supports the mark() operation.
     * @return <code>true</code>.
     */
    public boolean markSupported ()
    {
        return (true);
    }

    /**
     * Mark the present position in the source.
     * Subsequent calls to {@link #reset()}
     * will attempt to reposition the source to this point.
     * @param  readAheadLimit <em>Not used.</em>
     * @exception IOException If the source is closed.
     *
     */
    public void mark (int readAheadLimit) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        mMark = mOffset;
    }

    /**
     * Tell whether this source is ready to be read.
     * @return <code>true</code> if the next read() is guaranteed not to block
     * for input, <code>false</code> otherwise.
     * Note that returning false does not guarantee that the next read will block.
     * @exception IOException If the source is closed.
     */
    public boolean ready () throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        return (mOffset < mLevel);
    }

    /**
     * Skip characters.
     * This method will block until some characters are available,
     * an I/O error occurs, or the end of the stream is reached.
     * <em>Note: n is treated as an int</em>
     * @param n The number of characters to skip.
     * @return The number of characters actually skipped
     * @exception IllegalArgumentException If <code>n</code> is negative.
     * @exception IOException If an I/O error occurs.
     */
    public long skip (long n)
        throws
            IOException,
            IllegalArgumentException
    {
        long ret;

        if (null == mStream)
            throw new IOException ("source is closed");
        if (0 > n)
            throw new IllegalArgumentException ("cannot skip backwards");
        else
        {
            if (mLevel - mOffset < n)
                fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request
            if (mOffset >= mLevel)
                ret = EOF;
            else
            {
                ret = Math.min (mLevel - mOffset, n);
                mOffset += ret;
            }
        }

        return (ret);
    }
   /**
     * Undo the read of a single character.
     * @exception IOException If the source is closed or no characters have
     * been read.
     */
    public void unread () throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        if (0 < mOffset)
            mOffset--;
        else
            throw new IOException ("can't unread no characters");
    }

    /**
     * Retrieve a character again.
     * @param offset The offset of the character.
     * @return The character at <code>offset</code>.
     * @exception IOException If the offset is beyond {@link #offset()} or the
     * source is closed.
     */
    public char getCharacter (int offset) throws IOException
    {
        char ret;

        if (null == mStream)
            throw new IOException ("source is closed");
        if (offset >= mBuffer.length)
            throw new IOException ("illegal read ahead");
        else
            ret = mBuffer[offset];

        return (ret);
    }

    /**
     * Retrieve characters again.
     * @param array The array of characters.
     * @param offset The starting position in the array where characters are to be placed.
     * @param start The starting position, zero based.
     * @param end The ending position
     * (exclusive, i.e. the character at the ending position is not included),
     * zero based.
     * @exception IOException If the start or end is beyond {@link #offset()}
     * or the source is closed.
     */
    public void getCharacters (char[] array, int offset, int start, int end) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        System.arraycopy (mBuffer, start, array, offset, end - start);
    }

    /**
     * Retrieve a string.
     * @param offset The offset of the first character.
     * @param length The number of characters to retrieve.
     * @return A string containing the <code>length</code> characters at <code>offset</code>.
     * @exception IOException If the offset or (offset + length) is beyond
     * {@link #offset()} or the source is closed.
     */
    public String getString (int offset, int length) throws IOException
    {
        String ret;

        if (null == mStream)
            throw new IOException ("source is closed");
        if (offset + length > mBuffer.length)
            throw new IOException ("illegal read ahead");
        else
            ret = new String (mBuffer, offset, length);

        return (ret);
    }

    /**
     * Append characters already read into a <code>StringBuffer</code>.
     * @param buffer The buffer to append to.
     * @param offset The offset of the first character.
     * @param length The number of characters to retrieve.
     * @exception IOException If the offset or (offset + length) is beyond
     * {@link #offset()} or the source is closed.
     */
    public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException
    {
        if (null == mStream)
            throw new IOException ("source is closed");
        buffer.append (mBuffer, offset, length);
    }

    /**
     * Close the source.
     * Once a source has been closed, further {@link #read() read},
     * {@link #ready ready}, {@link #mark mark}, {@link #reset reset},
     * {@link #skip skip}, {@link #unread unread},
     * {@link #getCharacter getCharacter} or {@link #getString getString}
     * invocations will throw an IOException.
     * Closing a previously-closed source, however, has no effect.
     * @exception IOException If an I/O error occurs
     */
    public void destroy () throws IOException
    {
        mStream = null;
        if (null != mReader)
            mReader.close ();
        mReader = null;
        mBuffer = null;
        mLevel = 0;
        mOffset = 0;
        mMark = -1;
    }

    /**
     * Get the position (in characters).
     * @return The number of characters that have already been read, or
     * {@link #EOF EOF} if the source is closed.
     */
    public int offset ()
    {
        int ret;

        if (null == mStream)
            ret = EOF;
        else
            ret = mOffset;

        return (ret);
    }

    /**
     * Get the number of available characters.
     * @return The number of characters that can be read without blocking or
     * zero if the source is closed.
     */
    public int available ()
    {
        int ret;

        if (null == mStream)
            ret = 0;
        else
            ret = mLevel - mOffset;

        return (ret);
    }
}

会飞的五花肉

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
htmlparser的编码问题

htmlparser在提取网站内容时，有时会出现乱码或者是编码不能转换的问题。这是htmlparser的一个小bug，因为htmlparser作为一个开源软件已经很长时间没有更新了。org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) fo...
复制链接

扫一扫