htmlparser 乱码总结2

13 篇文章 0 订阅
    package org.htmlparser.lexer;  
    import java.io.ByteArrayInputStream;  
    import java.io.IOException;  
    import java.io.InputStream;  
    import java.io.InputStreamReader;  
    import java.io.ObjectInputStream;  
    import java.io.ObjectOutputStream;  
    import java.io.UnsupportedEncodingException;  
      
    import org.htmlparser.util.EncodingChangeException;  
    import org.htmlparser.util.ParserException;  
      
    public class InputStreamSource  
        extends  
            Source  
    {  
        /**
         * An initial buffer size.
         * Has a default value of {16384}.
         */  
        public static int BUFFER_SIZE = 16384;  
      
        /**
         * The stream of bytes.
         * Set to <code>null</code> when the source is closed.
         */  
        protected transient InputStream mStream;  
      
        /**
         * The character set in use.
         */  
        protected String mEncoding;  
      
        /**
         * The converter from bytes to characters.
         */  
        protected transient InputStreamReader mReader;  
      
        /**
         * The characters read so far.
         */  
        protected char[] mBuffer;  
      
        /**
         * The number of valid bytes in the buffer.
         */  
        protected int mLevel;  
      
        /**
         * The offset of the next byte returned by read().
         */  
        protected int mOffset;  
        /**
         * The bookmark.
         */  
        protected int mMark;  
      
        /**
         * Create a source of characters using the default character set.
         * @param stream The stream of bytes to use.
         * @exception UnsupportedEncodingException If the default character set
         * is unsupported.
         */  
        public InputStreamSource (InputStream stream)  
            throws  
                UnsupportedEncodingException  
        {  
            this (stream, null, BUFFER_SIZE);  
        }  
      
        public InputStreamSource (InputStream stream, String charset)  
            throws  
                UnsupportedEncodingException  
        {  
            this (stream, charset, BUFFER_SIZE);  
        }  
      
        /**
         * Create a source of characters.
         * @param stream The stream of bytes to use.
         * @param charset The character set used in encoding the stream.
         * @param size The initial character buffer size.
         * @exception UnsupportedEncodingException If the character set
         * is unsupported.
         */  
        public InputStreamSource (InputStream stream, String charset, int size)  
            throws  
                UnsupportedEncodingException  
        {  
            if (null == stream)  
                stream = new Stream (null);  
            else  
                // bug #1044707 mark()/reset() issues  
                if (!stream.markSupported ())  
                    // wrap the stream so we can reset  
                    stream = new Stream (stream);  
            mStream = stream;  
            if (null == charset)  
            {  
                mReader = new InputStreamReader (stream);  
                mEncoding = mReader.getEncoding ();  
            }  
            else  
            {  
                mEncoding = charset;  
                mReader = new InputStreamReader (stream, charset);  
            }  
            mBuffer = new char[size];  
            mLevel = 0;  
            mOffset = 0;  
            mMark = -1;  
        }  
      /**
         * Serialization support.
         * @param out Where to write this object.
         * @exception IOException If serialization has a problem.
         */  
        private void writeObject (ObjectOutputStream out)  
            throws  
                IOException  
        {  
            int offset;  
            char[] buffer;  
      
            if (null != mStream)  
            {  
                // remember the offset, drain the input stream, restore the offset  
                offset = mOffset;  
                buffer = new char[4096];  
                while (EOF != read (buffer))  
                    ;  
                mOffset = offset;  
            }  
      
            out.defaultWriteObject ();  
        }  
      
        /**
         * Deserialization support.
         * @param in Where to read this object from.
         * @exception IOException If deserialization has a problem.
         */  
        private void readObject (ObjectInputStream in)  
            throws  
                IOException,  
                ClassNotFoundException  
        {  
            in.defaultReadObject ();  
            if (null != mBuffer) // buffer is null when destroy's been called  
                // pretend we're open, mStream goes null when exhausted  
                mStream = new ByteArrayInputStream (new byte[0]);  
        }  
      
        /**
         * Get the input stream being used.
         * @return The current input stream.
         */  
        public InputStream getStream ()  
        {  
            return (mStream);  
        }  
      
        /**
         * Get the encoding being used to convert characters.
         * @return The current encoding.
         */  
        public String getEncoding ()  
        {  
            return (mEncoding);  
        }  
      
        /**
         * Begins reading from the source with the given character set.
         * If the current encoding is the same as the requested encoding,
         * this method is a no-op. Otherwise any subsequent characters read from
         * this page will have been decoded using the given character set.<p>
         * Some magic happens here to obtain this result if characters have already
         * been consumed from this source.
         * Since a Reader cannot be dynamically altered to use a different character
         * set, the underlying stream is reset, a new Source is constructed
         * and a comparison made of the characters read so far with the newly
         * read characters up to the current position.
         * If a difference is encountered, or some other problem occurs,
         * an exception is thrown.
         * @param character_set The character set to use to convert bytes into
         * characters.
         * @exception ParserException If a character mismatch occurs between
         * characters already provided and those that would have been returned
         * had the new character set been in effect from the beginning. An
         * exception is also thrown if the underlying stream won't put up with
         * these shenanigans.
         */  
        public void setEncoding (String character_set)  
            throws  
                ParserException  
        {     
            String encoding;  
            InputStream stream;  
            char[] buffer;  
            int offset;  
            char[] new_chars;  
      
            encoding = getEncoding ();  
            if(encoding!=null){  
                character_set=encoding;  
            }  
            if (!encoding.equalsIgnoreCase (character_set))  
            {  
                stream = getStream ();  
                try  
                {  
                    buffer = mBuffer;  
                    offset = mOffset;  
                    stream.reset ();  
                    try  
                    {  
                        mEncoding = character_set;  
                        mReader = new InputStreamReader (stream, character_set);  
                        mBuffer = new char[mBuffer.length];  
                        mLevel = 0;  
                        mOffset = 0;  
                        mMark = -1;  
                        if (0 != offset)  
                        {  
                            new_chars = new char[offset];  
                            if (offset != read (new_chars))  
                                throw new ParserException ("reset stream failed");  
                            for (int i = 0; i < offset; i++)  
                                if (new_chars[i] != buffer[i])  
                                    throw new EncodingChangeException ("character mismatch (new: "  
                                    + new_chars[i]  
                                    + " [0x"  
                                    + Integer.toString (new_chars[i], 16)  
                                    + "] != old: "  
                                    + " [0x"  
                                    + Integer.toString (buffer[i], 16)  
                                    + buffer[i]  
                                    + "]) for encoding change from "  
                                    + encoding  
                                    + " to "  
                                    + character_set  
                                    + " at character offset "  
                                    + i);  
                        }  
                    }  
                    catch (IOException ioe)  
                    {  
                        throw new ParserException (ioe.getMessage (), ioe);  
                    }  
                }  
                catch (IOException ioe)  
                {   // bug #1044707 mark()/reset() issues  
                    throw new ParserException ("Stream reset failed ("  
                        + ioe.getMessage ()  
                        + "), try wrapping it with a org.htmlparser.lexer.Stream",  
                        ioe);  
                }  
            }  
        }  
      
        /**
         * Fetch more characters from the underlying reader.
         * Has no effect if the underlying reader has been drained.
         * @param min The minimum to read.
         * @exception IOException If the underlying reader read() throws one.
         */  
        protected void fill (int min)  
            throws  
                IOException  
        {  
            char[] buffer;  
            int size;  
            int read;  
      
            if (null != mReader) // mReader goes null when it's been sucked dry  
            {  
                size = mBuffer.length - mLevel; // available space  
                if (size < min) // oops, better get some buffer space  
                {  
                    // unknown length... keep doubling  
                    size = mBuffer.length * 2;  
                    read = mLevel + min;  
                    if (size < read) // or satisfy min, whichever is greater  
                        size = read;  
                    else  
                        min = size - mLevel; // read the max  
                    buffer = new char[size];  
                }  
                else  
                {  
                    buffer = mBuffer;  
                    min = size;  
                }  
      
                // read into the end of the 'new' buffer  
                read = mReader.read (buffer, mLevel, min);  
                if (EOF == read)  
                {  
                    mReader.close ();  
                    mReader = null;  
                }  
                else  
                {  
                    if (mBuffer != buffer)  
                    {   // copy the bytes previously read  
                        System.arraycopy (mBuffer, 0, buffer, 0, mLevel);  
                        mBuffer = buffer;  
                    }  
                    mLevel += read;  
                }  
                // todo, should repeat on read shorter than original min  
            }  
        }  
        /**
         * Does nothing.
         * It's supposed to close the source, but use destroy() instead.
         * @exception IOException <em>not used</em>
         * @see #destroy
         */  
        public void close () throws IOException  
        {  
        }  
      
        /**
         * Read a single character.
         * This method will block until a character is available,
         * an I/O error occurs, or the end of the stream is reached.
         * @return The character read, as an integer in the range 0 to 65535
         * (<tt>0x00-0xffff</tt>), or {@link #EOF EOF} if the end of the stream has
         * been reached
         * @exception IOException If an I/O error occurs.
         */  
        public int read () throws IOException  
        {  
            int ret;  
      
            if (mLevel - mOffset < 1)  
            {  
                if (null == mStream)  
                    throw new IOException ("source is closed");  
                fill (1);  
                if (mOffset >= mLevel)  
                    ret = EOF;  
                else  
                    ret = mBuffer[mOffset++];  
            }  
            else  
                ret = mBuffer[mOffset++];  
      
            return (ret);  
        }  
      
        /**
         * Read characters into a portion of an array.  This method will block
         * until some input is available, an I/O error occurs, or the end of the
         * stream is reached.
         * @param cbuf Destination buffer
         * @param off Offset at which to start storing characters
         * @param len Maximum number of characters to read
         * @return The number of characters read, or {@link #EOF EOF} if the end of
         * the stream has been reached
         * @exception IOException If an I/O error occurs.
         */  
        public int read (char[] cbuf, int off, int len) throws IOException  
        {  
            int ret;  
      
            if (null == mStream)  
                throw new IOException ("source is closed");  
            if ((null == cbuf) || (0 > off) || (0 > len))  
                throw new IOException ("illegal argument read ("  
                    + ((null == cbuf) ? "null" : "cbuf")  
                    + ", " + off + ", " + len + ")");  
            if (mLevel - mOffset < len)  
                fill (len - (mLevel - mOffset)); // minimum to satisfy this request  
            if (mOffset >= mLevel)  
                ret = EOF;  
            else  
            {  
                ret = Math.min (mLevel - mOffset, len);  
                System.arraycopy (mBuffer, mOffset, cbuf, off, ret);  
                mOffset += ret;  
            }  
      
            return (ret);  
        }  
      
        /**
         * Read characters into an array.
         * This method will block until some input is available, an I/O error occurs,
         * or the end of the stream is reached.
         * @param cbuf Destination buffer.
         * @return The number of characters read, or {@link #EOF EOF} if the end of
         * the stream has been reached.
         * @exception IOException If an I/O error occurs.
         */  
        public int read (char[] cbuf) throws IOException  
        {  
            return (read (cbuf, 0, cbuf.length));  
        }  
      
        /**
         * Reset the source.
         * Repositions the read point to begin at zero.
         * @exception IllegalStateException If the source has been closed.
         */  
        public void reset ()  
            throws  
                IllegalStateException  
        {  
            if (null == mStream)  
                throw new IllegalStateException ("source is closed");  
            if (-1 != mMark)  
                mOffset = mMark;  
            else  
                mOffset = 0;  
        }  
      
        /**
         * Tell whether this source supports the mark() operation.
         * @return <code>true</code>.
         */  
        public boolean markSupported ()  
        {  
            return (true);  
        }  
      
        /**
         * Mark the present position in the source.
         * Subsequent calls to {@link #reset()}
         * will attempt to reposition the source to this point.
         * @param  readAheadLimit <em>Not used.</em>
         * @exception IOException If the source is closed.
         *
         */  
        public void mark (int readAheadLimit) throws IOException  
        {  
            if (null == mStream)  
                throw new IOException ("source is closed");  
            mMark = mOffset;  
        }  
      
        /**
         * Tell whether this source is ready to be read.
         * @return <code>true</code> if the next read() is guaranteed not to block
         * for input, <code>false</code> otherwise.
         * Note that returning false does not guarantee that the next read will block.
         * @exception IOException If the source is closed.
         */  
        public boolean ready () throws IOException  
        {  
            if (null == mStream)  
                throw new IOException ("source is closed");  
            return (mOffset < mLevel);  
        }  
      
        /**
         * Skip characters.
         * This method will block until some characters are available,
         * an I/O error occurs, or the end of the stream is reached.
         * <em>Note: n is treated as an int</em>
         * @param n The number of characters to skip.
         * @return The number of characters actually skipped
         * @exception IllegalArgumentException If <code>n</code> is negative.
         * @exception IOException If an I/O error occurs.
         */  
        public long skip (long n)  
            throws  
                IOException,  
                IllegalArgumentException  
        {  
            long ret;  
      
            if (null == mStream)  
                throw new IOException ("source is closed");  
            if (0 > n)  
                throw new IllegalArgumentException ("cannot skip backwards");  
            else  
            {  
                if (mLevel - mOffset < n)  
                    fill ((int)(n - (mLevel - mOffset))); // minimum to satisfy this request  
                if (mOffset >= mLevel)  
                    ret = EOF;  
                else  
                {  
                    ret = Math.min (mLevel - mOffset, n);  
                    mOffset += ret;  
                }  
            }  
      
            return (ret);  
        }  
       /**
         * Undo the read of a single character.
         * @exception IOException If the source is closed or no characters have
         * been read.
         */  
        public void unread () throws IOException  
        {  
            if (null == mStream)  
                throw new IOException ("source is closed");  
            if (0 < mOffset)  
                mOffset--;  
            else  
                throw new IOException ("can't unread no characters");  
        }  
      
        /**
         * Retrieve a character again.
         * @param offset The offset of the character.
         * @return The character at <code>offset</code>.
         * @exception IOException If the offset is beyond {@link #offset()} or the
         * source is closed.
         */  
        public char getCharacter (int offset) throws IOException  
        {  
            char ret;  
      
            if (null == mStream)  
                throw new IOException ("source is closed");  
            if (offset >= mBuffer.length)  
                throw new IOException ("illegal read ahead");  
            else  
                ret = mBuffer[offset];  
              
            return (ret);  
        }  
      
        /**
         * Retrieve characters again.
         * @param array The array of characters.
         * @param offset The starting position in the array where characters are to be placed.
         * @param start The starting position, zero based.
         * @param end The ending position
         * (exclusive, i.e. the character at the ending position is not included),
         * zero based.
         * @exception IOException If the start or end is beyond {@link #offset()}
         * or the source is closed.
         */  
        public void getCharacters (char[] array, int offset, int start, int end) throws IOException  
        {  
            if (null == mStream)  
                throw new IOException ("source is closed");  
            System.arraycopy (mBuffer, start, array, offset, end - start);  
        }  
          
        /**
         * Retrieve a string.
         * @param offset The offset of the first character.
         * @param length The number of characters to retrieve.
         * @return A string containing the <code>length</code> characters at <code>offset</code>.
         * @exception IOException If the offset or (offset + length) is beyond
         * {@link #offset()} or the source is closed.
         */  
        public String getString (int offset, int length) throws IOException  
        {  
            String ret;  
      
            if (null == mStream)  
                throw new IOException ("source is closed");  
            if (offset + length > mBuffer.length)  
                throw new IOException ("illegal read ahead");  
            else  
                ret = new String (mBuffer, offset, length);  
              
            return (ret);  
        }  
      
        /**
         * Append characters already read into a <code>StringBuffer</code>.
         * @param buffer The buffer to append to.
         * @param offset The offset of the first character.
         * @param length The number of characters to retrieve.
         * @exception IOException If the offset or (offset + length) is beyond
         * {@link #offset()} or the source is closed.
         */  
        public void getCharacters (StringBuffer buffer, int offset, int length) throws IOException  
        {  
            if (null == mStream)  
                throw new IOException ("source is closed");  
            buffer.append (mBuffer, offset, length);  
        }  
      
        /**
         * Close the source.
         * Once a source has been closed, further {@link #read() read},
         * {@link #ready ready}, {@link #mark mark}, {@link #reset reset},
         * {@link #skip skip}, {@link #unread unread},
         * {@link #getCharacter getCharacter} or {@link #getString getString}
         * invocations will throw an IOException.
         * Closing a previously-closed source, however, has no effect.
         * @exception IOException If an I/O error occurs
         */  
        public void destroy () throws IOException  
        {  
            mStream = null;  
            if (null != mReader)  
                mReader.close ();  
            mReader = null;  
            mBuffer = null;  
            mLevel = 0;  
            mOffset = 0;  
            mMark = -1;  
        }  
      
        /**
         * Get the position (in characters).
         * @return The number of characters that have already been read, or
         * {@link #EOF EOF} if the source is closed.
         */  
        public int offset ()  
        {  
            int ret;  
      
            if (null == mStream)  
                ret = EOF;  
            else  
                ret = mOffset;  
      
            return (ret);  
        }  
      
        /**
         * Get the number of available characters.
         * @return The number of characters that can be read without blocking or
         * zero if the source is closed.
         */  
        public int available ()  
        {  
            int ret;  
      
            if (null == mStream)  
                ret = 0;  
            else  
                ret = mLevel - mOffset;  
      
            return (ret);  
        }  
    }  



页面的调用


    String encoding=parser.getLexer().getPage().getQICHAODEFAULT_CHARSET();
        parser.setEncoding(encoding);
















评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值