字符编码理解

最新推荐文章于 2020-09-18 13:53:52 发布

南南北北

最新推荐文章于 2020-09-18 13:53:52 发布

阅读量592

点赞数

分类专栏： j2ee 文章标签： string byte html servlet input eclipse

本文链接：https://blog.csdn.net/norwolfli/article/details/1724791

版权

j2ee 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

/**
 * Prints the GBK and UTF-8 byte sequences of the character "中" and then the result of decoding
 * the GBK bytes as UTF-8.
 *
 * <p>Background for the conversions shown here:
 * <ol>
 *   <li>{@code String str = "中";} the literal lives in the string pool; Java stores it in its
 *       Unicode form (U+4E2D).</li>
 *   <li>{@code byte[] bGBK = str.getBytes("GBK");} reads the string and encodes it as GBK,
 *       giving {@code {d6, d0}}.</li>
 *   <li>{@code byte[] bUTF8 = str.getBytes("UTF-8");} encodes it as UTF-8, giving
 *       {@code {e4, b8, ad}}.</li>
 *   <li>{@code new String(bGBK, "GBK")} decodes d6d0 as GBK and yields "中" again.</li>
 *   <li>{@code new String(bUTF8, "UTF-8")} decodes e4b8ad as UTF-8 and yields "中" again.</li>
 *   <li>Garbled text: {@code new String(bGBK, "UTF-8")} decodes d6d0 as UTF-8. The result is
 *       not necessarily one character; it may be two, or "?" when the bytes are not
 *       recognised.</li>
 *   <li>Web pages get garbled because data travels as bytes and the container turns them into
 *       a string with its own default charset, which can trigger case 6. Converting back
 *       repairs it: {@code new String(strError.getBytes(containerDefaultCharset),
 *       charsetOfTheRequest)}.</li>
 * </ol>
 *
 * @param args unused
 * @throws UnsupportedEncodingException if "GBK" or "UTF-8" is not a supported charset name
 */
public static void main(String[] args) throws UnsupportedEncodingException {
	final String original = "中";

	// Hex dump of the GBK encoding, one byte per line.
	final byte[] gbkBytes = original.getBytes("GBK");
	for (byte gbkByte : gbkBytes) {
		System.out.println(">>>>" + Integer.toHexString(gbkByte & 0xff));
	}

	// Hex dump of the UTF-8 encoding, one byte per line.
	final byte[] utf8Bytes = original.getBytes("UTF-8");
	for (byte utf8Byte : utf8Bytes) {
		System.out.println("<<<<" + Integer.toHexString(utf8Byte & 0xff));
	}

	// Deliberate mismatch: GBK bytes decoded as UTF-8.
	final String misdecoded = new String(gbkBytes, "UTF-8");
	System.out.println(original);
	System.out.println(misdecoded);
}

<%-- Test page for the GET case: the page directive and the meta tag both declare UTF-8. --%>
<%@ page language="java" contentType="text/html; charset=UTF-8"

    pageEncoding="UTF-8"%>

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html>

<head>

<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">



<%-- Submits the "username" field (pre-filled with two Chinese characters) to Web2Servlet using GET. --%>
<form action="Web2Servlet" method="get">

	<input type="text" name="username" value="中国" />

	<input type="submit" value="提交">

</form>

/**
 * Shows how the original bytes of a mis-decoded GET parameter can be recovered.
 *
 * <p>Reads the {@code username} parameter, encodes it with the "UNICODE" charset, collects the
 * low byte of up to the first three characters, decodes those bytes as UTF-8 and prints every
 * intermediate value to standard output.
 *
 * @param request  the incoming request whose {@code username} parameter is inspected
 * @param response not used
 * @throws ServletException declared for the servlet contract; not thrown by this code
 * @throws IOException if a requested charset name is not supported
 */
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
	String str = request.getParameter("username");
	System.out.println("str= "+str);
	if (str == null) {
		// No "username" parameter was sent; there are no bytes to inspect.
		return;
	}

	byte[] b2 = str.getBytes("UNICODE");

	// Layout seen in the sample output: a two-byte mark (fe ff) followed by two bytes per
	// character, high byte first. The low bytes therefore sit at indexes 3, 5, 7, ...
	int availableChars = Math.max(0, (b2.length - 2) / 2);
	int count = Math.min(3, availableChars);
	byte[] b3 = new byte[count];
	for (int i = 0; i < count; i++) {
		b3[i] = b2[3 + 2 * i];
	}

	for (int i = 0; i < b3.length; i++) {
		System.out.println("b3["+i+"]="+Integer.toHexString(b3[i]&0xff));
	}

	// Three low bytes are exactly one Chinese character in UTF-8 for the sample input.
	String test = new String(b3,"UTF-8");
	System.out.println("test: "+test);

	for (int i = 0; i < b2.length; i++) {
		System.out.println("b2["+i+"]="+Integer.toHexString(b2[i]&0xff));
	}
}

tomcat未指定URIEncoding="UTF-8"时（Tomcat 7及更早版本），tomcat默认按ISO-8859-1对get提交的参数进行解码：每个UTF-8字节被当成一个字符，在Java内部以UNICODE存储，因此str显示为乱码。
打印结果：
str= ??????
b3[0]=e4
b3[1]=b8
b3[2]=ad
test: 中
b2[0]=fe
b2[1]=ff
b2[2]=0
b2[3]=e4
b2[4]=0
b2[5]=b8
b2[6]=0
b2[7]=ad
b2[8]=0
b2[9]=e5
b2[10]=0
b2[11]=9b
b2[12]=0
b2[13]=bd

ps:在eclipse下通过new server创建的tomcat，更改URIEncoding时需要修改eclipse的Servers工程中的server.xml，否则不会生效。

<%-- Test page for the POST case: the page directive and the meta tag both declare UTF-8. --%>
<%@ page language="java" contentType="text/html; charset=UTF-8"

    pageEncoding="UTF-8"%>

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<html>

<head>

<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">



<%-- Submits the "username" field (pre-filled with two Chinese characters) to Web2Servlet using POST. --%>
<form action="Web2Servlet" method="post">

	<input type="text" name="username" value="中国" />

	<input type="submit" value="提交">

</form>


/**
 * Handles the POST form: sets the request character encoding to UTF-8, then prints a marker
 * line and the value of the {@code username} parameter to standard output.
 *
 * @param request  the incoming request; its encoding is set before the parameter is read
 * @param response not used
 * @throws ServletException declared for the servlet contract; not thrown by this code
 * @throws IOException if "UTF-8" is rejected as a character encoding
 */
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
	// The encoding is set first, ahead of the parameter lookup below.
	request.setCharacterEncoding("UTF-8");

	System.out.println("this is post method");
	System.out.println("str= " + request.getParameter("username"));
}

如果指定request.setCharacterEncoding("UTF-8");则正常显示。所以一般处理乱码时，使用过滤器统一设置。
如果不指定request.setCharacterEncoding("UTF-8");那么显示结果和get方法提交时一样是乱码，也就是
tomcat按默认的ISO-8859-1对字节流进行了解码。

ps:过滤器和request.setCharacterEncoding("UTF-8");只对消息体（如post提交的表单参数）有作用，对URL中的get参数和消息头无效。