爬取网页中遇到的编码问题-CSDN博客

利用httpclient爬取网页的过程中，需要根据网页的编码来解码网页内容，而网页编码通常定义在html的meta标签中（http-equiv="Content-Type"的meta标签的content属性里的charset字段，或HTML5的charset属性），因此为了防止乱码，需要先获取charset字段中的编码方式。
解决思路：先按照默认方式将网页源码爬取下来，并存入byte型数组中；之后用findCharSet方法，利用正则表达式获得meta标签中的编码；最后利用byte数组和获得的编码重新构造String对象（即最后的网页源代码）。
若网页源码中无编码定义，则使用http协议头（Content-Type）中的编码定义。

具体代码如下：


// Default encoding: what findCharSet returns when the page declares no charset
	/**
	 * Fallback charset name that {@link #findCharSet(String)} returns when the page
	 * source declares no charset. It starts out as {@code null}; {@code crawl}
	 * assigns it from the HTTP response's Content-Type header.
	 */
	public static String Default_charet = null;

	/** Matches one complete {@code <meta ...>} tag, in any letter case, possibly spanning lines. */
	private static final Pattern META_TAG_PATTERN =
			Pattern.compile("<meta\\b[^>]*>", Pattern.CASE_INSENSITIVE);

	/**
	 * Matches {@code charset=value} inside a meta tag and captures the value.
	 * Covers {@code charset="utf-8"}, {@code charset='utf-8'}, {@code charset=utf-8}
	 * and {@code content="text/html; charset=utf-8"}.
	 */
	private static final Pattern CHARSET_ATTR_PATTERN =
			Pattern.compile("\\bcharset\\s*=\\s*[\"']?\\s*([^\\s\"'/>;]+)", Pattern.CASE_INSENSITIVE);

	/**
	 * Extracts the charset name declared in the {@code <meta>} tags of an HTML document.
	 *
	 * @param html the page source, decoded with any ASCII-compatible charset; may be {@code null}
	 * @return the declared charset name exactly as written in the page, or
	 *         {@link #Default_charet} (possibly {@code null}) when no declaration is found
	 */
	public static String findCharSet(String html){

		// Start from the fallback; it is returned unchanged when nothing is declared.
		String charset = Default_charet;
		if (html == null) {
			return charset;
		}

		Matcher tags = META_TAG_PATTERN.matcher(html);
		while (tags.find()) {
			Matcher declaration = CHARSET_ATTR_PATTERN.matcher(tags.group());
			// As in the previous implementation, a later declaration overrides an earlier one.
			while (declaration.find()) {
				charset = declaration.group(1);
			}
		}
		return charset;
	}

	public static byte[] crawl(String url){
//		System.out.println(url);
		//存储网页内容
		byte[] result = null;

		//创建HttpClientBuilder
        HttpClientBuilder httpClientBuilder = HttpClientBuilder.create();
        //HttpClient
        CloseableHttpClient closeableHttpClient = httpClientBuilder.build();

        HttpGet httpGet = new HttpGet(url);



        try {

            //设置请求和链接超时
        	RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(100000).setConnectTimeout(100000).build();
          	httpGet.setConfig(requestConfig);


        	//执行get请求
          	HttpResponse httpResponse = closeableHttpClient.execute(httpGet);          		

           [color=red] //获取响应消息实体
            HttpEntity entity = httpResponse.getEntity();
            //获得响应的http协议头中包含字符编码的部分
            String content_type = entity.getContentType().toString();
            int  index = content_type.toLowerCase().lastIndexOf("charset=")+"charset=".length();
            Default_charet = content_type.substring(index);
[/color]            
            //响应状态
            if(httpResponse.getStatusLine().getStatusCode() == HttpStatus.SC_OK){

	            //判断响应实体是否为空
	            if (entity != null) {
	            	try{
	            		result = EntityUtils.toByteArray(entity);
	            	}catch(SocketTimeoutException e){}
//	                
	            }
            }
        }catch (IOException e) {
            e.printStackTrace();
        }finally {

            try {
                //关闭流并释放资源
                closeableHttpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return result;
    }

	/**
	 * Downloads a page and decodes it with the charset the page declares.
	 *
	 * <p>The bytes returned by {@code crawl} are first decoded as ISO-8859-1 so that
	 * {@code findCharSet} can read the meta tags; the same bytes are then decoded
	 * again with the charset it reports. If no charset is reported, or the JVM does
	 * not support the reported one, UTF-8 is used instead.
	 *
	 * @param url the address to download
	 * @return the decoded page source, or {@code null} when {@code url} is
	 *         {@code null}, ends in one of the skipped file extensions, or
	 *         {@code crawl} returns no content
	 */
	public static String downPage(String url){

		if (url == null)
			return null;

		// Skip URLs that end in a known non-HTML file extension, in any letter case.
		String regex = "(?i)[\\S]*\\.(xls|jpg|doc|docx|rar|pdf|css|exe|ppt|wmv)";
		if(url.matches(regex))
			return null;

		// Raw page bytes; decoded below once the charset is known.
		byte[] b = crawl(url);
		if(b==null)
			return null;

		String result = null;
		try {
			// ISO-8859-1 maps every byte to one char, so this decode is lossless and
			// independent of the platform default; the ASCII meta tags remain readable.
			String sniffed = new String(b, "ISO-8859-1");

			String charset = findCharSet(sniffed);
			if (charset == null || charset.trim().isEmpty()) {
				charset = "UTF-8";
			}

			try {
				result = new String(b, charset.trim());
			} catch (UnsupportedEncodingException e) {
				// The declared charset is unknown to this JVM: keep the page, decoded as UTF-8.
				System.err.println("Unsupported charset '" + charset + "' for " + url + ", falling back to UTF-8");
				result = new String(b, "UTF-8");
			}
		} catch (UnsupportedEncodingException e) {
			// ISO-8859-1 and UTF-8 are required on every JVM, so this should not happen.
			e.printStackTrace();
		}

		return result;
	}