获取网页的编码方式

最新推荐文章于 2023-06-21 08:29:13 发布

weixin_30820077

最新推荐文章于 2023-06-21 08:29:13 发布

阅读量506

点赞数 1

文章标签： java

原文链接：http://www.cnblogs.com/wylwyl/p/10889276.html

版权

package com.tl.spider.download;

import com.tl.spider.utils.StaticValue;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @ClassName CharsetDetectorUtil
 * @Description 给定一个url，获取其网页源代码的编码
 * @Author Administrator
 * @Date 2019/5/19 13:57
 * @Version 1.0
 **/
public class CharsetDetectorUtil {

    /**
     * 获取网页源代码的编码方式
     * @param url
     * @return
     * @throws IOException
     */
    public static String getCharset(String url) throws Exception {

        String charSet = null;
        URL urlObject = new URL(url);
        URLConnection urlConnection = urlObject.openConnection();
        Map<String, List<String>> map = urlConnection.getHeaderFields();
        List<String> list = map.get("Content-Type");
        if(list != null && !list.isEmpty()) {
            String line = list.get(0);
            String[] array = line.split(";");
            for(String str : array) {
                String[] eleArray = str.split("=");
                if(eleArray.length == 2) {
                   if(eleArray[0].equals("charset")) {
                       charSet = eleArray[1].trim();
                   }
                }

            }
        }

        /**
         * 由于网页的编码方式的说明只在网页源代码的前几行，所以不需要获取所有的网页源代码
         */
        if(charSet == null) {
            // 启用meta获取网页的编码方式
            BufferedReader bufferedReader = WebPageDownLoadUtil.getBR(url, StaticValue.ENCODING_DEFAULT);
            String tmp = null;
            while((tmp = bufferedReader.readLine()) != null) {
                tmp = tmp.toLowerCase();

                String charset = getCharSetValue4Line(tmp);
                if(charset != null) {
                    charSet = charset;
                    break;
                }

                if(tmp.contains("</head>")) {
                    break;
                }

            }

            if(bufferedReader != null) {
                bufferedReader.close();
            }

        }
        return charSet;
    }

    /**
     *
     * @param line
     * @return
     */
    public static String getCharSetValue4Line(String line) {
        String charsetValue = null;
        String regex = "charset=\"?(.+?)\"?\\s?/?>"; // 这个地方需要综合多个网页进行相应的修改
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(line);
        if(matcher.find()) {
            charsetValue = matcher.group(1);
        }
        return charsetValue;
    }
    public static void main(String[] args) throws Exception {
        String url = "http://news.youth.cn/";
        //String url = "https://www.baidu.com/";
        //String url= "https://hao.360.com/?s0001";
        String charSet = getCharset(url);
        System.out.println(charSet);
    }
}

其中 WebPageDownLoadUtil.getBR 函数为：

    /**
     * Opens the resource at {@code url} and wraps it in a {@link BufferedReader} that decodes
     * the bytes with the given charset. The caller is responsible for closing the reader.
     *
     * @param url     absolute URL of the resource to read
     * @param charset name of the charset used to decode the stream
     * @return a buffered reader over the resource's content
     * @throws Exception if the URL is malformed, the stream cannot be opened, or the charset
     *                   name is not supported
     */
    public static BufferedReader getBR(String url, String charset) throws Exception {
        InputStream inputStream = new URL(url).openStream();
        try {
            return new BufferedReader(new InputStreamReader(inputStream, charset));
        } catch (Exception e) {
            // The stream is already open when the charset name is rejected; close it here so
            // the connection is not leaked, then report the original failure.
            try {
                inputStream.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

转载于:https://www.cnblogs.com/wylwyl/p/10889276.html

weixin_30820077

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
获取网页的编码方式

package com.tl.spider.download;import com.tl.spider.utils.StaticValue;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamRead...
复制链接

扫一扫