尝试获取页面的字符编码

最新推荐文章于 2019-11-07 10:13:03 发布

ckcs49

最新推荐文章于 2019-11-07 10:13:03 发布

阅读量1.8k

点赞数

分类专栏： java 文章标签： import exception null byte string class

本文链接：https://blog.csdn.net/ckcs49/article/details/1808897

版权

java 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

 
   
   
  /*
 * EncodeGoter.java
 *
 * Created on 2007年9月30日, 下午4:49
 *
 * To change this template, choose Tools | Template Manager
 * and open the template in the editor.
 */ 
  

 
  package 
   com.ckcs.url;

 
  import 
   java.io.BufferedInputStream;
 
  import 
   java.io.ByteArrayOutputStream;
 
  import 
   java.io.InputStream;
 
  import 
   java.net.URL;
 
  import 
   java.net.URLConnection;
 
  import 
   java.nio.charset.Charset;
 
  import 
   java.util.regex.Matcher;
 
  import 
   java.util.regex.Pattern;

 
  /**
   * Best-effort detector for the character encoding declared by a web page.
   *
   * <p>The HTTP {@code Content-Type} response header is consulted first; if it
   * carries no {@code charset=} parameter, the beginning of the response body is
   * scanned for a {@code charset=} declaration such as the one found in an HTML
   * {@code <meta>} tag.
   *
   * @author admin
   */
  public class EncodeGoter {

      /**
       * Matches {@code charset = value}, with an optional opening quote, up to
       * (but not including) the first character that cannot belong to a charset
       * name. Compiled once because it never changes.
       */
      private static final Pattern CHARSET_PATTERN = Pattern.compile(
              "charset\\s*=\\s*[\"']?[^\\s\"'/>;]+", Pattern.CASE_INSENSITIVE);

      /** Upper bound on how many body bytes are inspected before giving up. */
      private static final int MAX_SCAN_BYTES = 64 * 1024;

      /** Size of each read from the response body. */
      private static final int CHUNK_SIZE = 1024;

      /**
       * Creates a new instance of EncodeGoter.
       */
      public EncodeGoter() {
      }

      /**
       * Returns the character encoding declared for the resource at the given URL.
       *
       * @param size the URL of the page to inspect, as a string
       * @return the declared charset name, or {@code null} if neither the
       *         response header nor the scanned part of the body declares one
       * @throws Exception if the URL is malformed or the resource cannot be read
       */
      private String getEncode(String size) throws Exception {
          URL url = new URL(size);
          URLConnection con = url.openConnection();

          // First try the HTTP response header (getContentType() may return null).
          String charset = doGetEncode(CHARSET_PATTERN, con.getContentType());
          if (charset != null) {
              return charset;
          }

          // Otherwise look inside the page itself. The already-open connection is
          // reused rather than issuing a second request, and the stream is always
          // closed.
          InputStream is = con.getInputStream();
          try {
              return scanStream(is, CHARSET_PATTERN);
          } finally {
              is.close();
          }
      }

      /**
       * Reads the stream chunk by chunk, accumulating what has been read so that a
       * declaration split across two reads is still found.
       *
       * @param in      the response body
       * @param pattern the pattern locating a charset declaration
       * @return the declared charset name, or {@code null} if none is found within
       *         the first {@link #MAX_SCAN_BYTES} bytes
       * @throws Exception if reading from the stream fails
       */
      private String scanStream(InputStream in, Pattern pattern) throws Exception {
          BufferedInputStream bis = new BufferedInputStream(in);
          ByteArrayOutputStream bos = new ByteArrayOutputStream();
          byte[] bytes = new byte[CHUNK_SIZE];
          int count;
          while (bos.size() < MAX_SCAN_BYTES && (count = bis.read(bytes)) != -1) {
              bos.write(bytes, 0, count);
              // ISO-8859-1 maps every byte to exactly one char, so the ASCII
              // declaration is recognisable whatever the page's real encoding is
              // and regardless of the platform default charset.
              String text = bos.toString("ISO-8859-1");
              Matcher matcher = pattern.matcher(text);
              // A match touching the end of the buffer may be a value that was cut
              // off by the chunk boundary; keep reading before trusting it.
              if (matcher.find() && matcher.end() < text.length()) {
                  return doGetEncode(pattern, text);
              }
          }
          // End of stream or scan limit reached: accept whatever is there.
          return doGetEncode(pattern, bos.toString("ISO-8859-1"));
      }

      /**
       * Finds the first match of {@code pattern} in {@code str} and extracts the
       * charset name from it.
       *
       * @param pattern the pattern locating a charset declaration
       * @param str     the text to search; may be {@code null}
       * @return the charset name, or {@code null} if {@code str} is {@code null},
       *         nothing matches, or the matched declaration has an empty value
       * @throws Exception declared for compatibility with existing callers
       */
      private String doGetEncode(Pattern pattern, String str) throws Exception {
          if (str == null) {
              return null;
          }
          Matcher matcher = pattern.matcher(str);
          if (!matcher.find()) {
              return null;
          }
          // Keep only what follows the first '=' of the match.
          String matchStr = matcher.group();
          String value = matchStr.substring(matchStr.indexOf('=') + 1).trim();

          // Skip an opening quote, then stop at the first character that cannot
          // be part of a charset name (quote, whitespace, '/', '>' or ';').
          int start = 0;
          if (value.length() > 0 && (value.charAt(0) == '"' || value.charAt(0) == '\'')) {
              start = 1;
          }
          int end = start;
          while (end < value.length() && !isDelimiter(value.charAt(end))) {
              end++;
          }
          return end > start ? value.substring(start, end) : null;
      }

      /** Tells whether {@code c} terminates a charset name inside markup or a header. */
      private static boolean isDelimiter(char c) {
          return Character.isWhitespace(c)
                  || c == '"' || c == '\'' || c == '/' || c == '>' || c == ';';
      }

      /**
       * Prints the detected encoding of a sample page, or the platform default
       * charset when none can be detected.
       *
       * @param args ignored
       * @throws Exception if the sample page cannot be fetched
       */
      public static void main(String[] args) throws Exception {
          EncodeGoter eg = new EncodeGoter();
          // Other pages this was tried against: http://www.sun.com,
          // http://www.csdn.net, http://www.dmoz.org/,
          // http://www.baidu.com/search/image_recommend.html
          String charset = eg.getEncode("http://java.sun.com");
          if (charset != null) {
              System.out.println("页面的字符编码应该为：" + charset);
          } else {
              charset = Charset.defaultCharset().toString();  // fall back to the platform default
              System.out.println("找不到页面字符编码，平台默认编码为：" + charset);
          }
      }
  }
  
 
 

ckcs49

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
4
评论
尝试获取页面的字符编码

/**//* * EncodeGoter.java * * Created on 2007年9月30日, 下午4:49 * * To change this template, choose Tools | Template Manager * and open the template in the editor. */package com.ckcs.url;import java.io.
复制链接

扫一扫