异常处理:使用UTF-8编码Mysql仍然抛出Incorrect string value异常

之前还以为从上至下统一用上UTF-8就高枕无忧了,哪知道今天在抓取新浪微博的数据的时候还是遇到字符的异常。

新浪微博返回的数据编码是unicode,而数据库用的是uft8,在转换的时候有些字符会得到4字节的utf8编码。

从新浪微博抓到的数据在入库的时候抛出异常:

Incorrect string value: '\xF0\x90\x8D\x83\xF0\x90...' 

发现导致异常的字符不是繁体而是某种佛经文字。。。额滴神。。。但是按道理UTF-8应该能支持才对啊,他不是万能的么?


原来问题出在mysql上,mysql如果设置编码集为utf8那么它最多只能支持到3个字节的UTF-8编码,而4个字节的UTF-8字符还是存在的,这样一来如果你建表的时候用的utf8字符集出异常就理所当然了。


解决方法很简单,修改字段或者表的字符集为utf8mb4。


比较蛋疼的是,字符集utf8mb4在mysql 5.5.3之后才支持。。。



package weibo4j.sina;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.sql.BatchUpdateException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.List;

import weibo4j.entity.StockByCompanyName;
import weibo4j.model.WeiboException;
import weibo4j.org.json.JSONArray;
import weibo4j.org.json.JSONException;
import weibo4j.org.json.JSONObject;
import weibo4j.util.BareBonesBrowserLaunch;
import weibo4j.util.Cont;
import weibo4j.util.DBUtil;
import weibo4j.util.DatePaser;
import weibo4j.util.DeleteHtml;
import weibo4j.util.HavingCrawl;
import weibo4j.util.UnicodeToUTF8;
import weibo4j.util.WeiIDPaser;
import weibo4j.weiutils.Oauth;
import weibo4j.weiutils.Search;

public class Sina {


	static String label = "server1";
	static String access_token = "";
//	static long p = 0;  //返回记录的总页数
	static int count = 50;     //每页记录数,由于新浪每次只返回最新200条微博,建议此值设置为20或10的倍数,以方便后面计算;
	static int searchWaitTime =3;   //秒,每个关键词采集完后休眠时间,
	static int allSearchWaitTime = 20 ;//分,所有关键词采集完后休眠时间,
	static int successNum =0;//每批次插入数据的记录数
	static String wei_platform ="新浪微博";
	static String t_listed_company_dic = "t_listed_company_dic_sina";
	static String t_sina_weibo_text = "t_sina_weibo_text";
	public static String crawlTimeTable ="t_weibo_crawl_time_sina";//库表,用于判断某关键词微博是否已经采集过

	public static void main(String[] args) {
		Oauth oauth = new Oauth();
		try {
			BareBonesBrowserLaunch.openURL(oauth.authorize("code"));
			System.out.print("enter code:");
			BufferedReader br = new BufferedReader(new InputStreamReader(
					System.in));
			String code = br.readLine();
			access_token = oauth.getAccessTokenByCode(code).getAccessToken();
//			System.out.print("access_token");
		} catch (WeiboException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}

		Sina.doCrawl();// 采集微博

	}

	public static void doCrawl() {
		Search search = new Search();
		search.client.setToken(access_token);
		System.out.print("setToken access_token");
		
		Connection conn = null;
		PreparedStatement pstmt = null;
		
		String sql = "insert into " + t_sina_weibo_text + "(wei_id,head,nick,wei_text,timestam,daydate,fro,wei_platform,company_name,search_name,company_code,isVerified) "
				+ "values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?)";


		while (true) {
			try {
				conn = DBUtil.getConnection();
				pstmt = conn.prepareStatement(sql);
				String last_companyName_sql = "select company_name from "
						+ t_listed_company_dic
						+ " where id =(select max(id) from "
						+ t_listed_company_dic + " where label ='" + label
						+ "' and parent_id=0)";
				String lastCompanyName = Cont.get(last_companyName_sql);

				String sql_company = "select company_name,company_code,category from "
						+ t_listed_company_dic
						+ " where parent_id=0 and label='" + label + "'";
				List<StockByCompanyName> lists_companies = Cont
						.getAllCompanyNames(sql_company);
				String company_name = "";
				String company_code = "";
				String search_company = "";
				String sinceTime ="";

				for (int i = 0; i < lists_companies.size(); i++) {
					StockByCompanyName parent_company = lists_companies.get(i);// 得到公司名
					company_name = parent_company.getCompany_name();
					company_code = parent_company.getCompany_code();
					System.out.println("company_name:" + company_name + ","
							+ company_code );
					String sql_search = "select company_name from "
							+ t_listed_company_dic
							+ " where parent_id in (select id from "
							+ t_listed_company_dic + " where company_name='"
							+ company_name + "')";
					List<String> search_companies = Cont.getContent(sql_search,
							"company_name");

					for (int j = 0; j < search_companies.size(); j++) {

						search_company = search_companies.get(j);// 例如:上证综合指数等公司搜索词
//						try{
//							JSONObject ss = search.searchTopics(search_company, count,1);//调用接口获取结果数量
//							resultCount = Integer.parseInt(ss.getString("total_number"));//返回结果总数
//							if(resultCount==0){
//								break;
//							}
//						} catch (NumberFormatException e) {
//							e.printStackTrace();
//						} catch (JSONException e) {
//							e.printStackTrace();
//						} catch (WeiboException e) {
//							e.printStackTrace();
//						} 

					
//						p =  resultCount%count==0?resultCount/count:resultCount/count+1;//获取总页数

//						int allowMaxPages = 200%count==0?200/count:200/count+1;//假设所有搜索词返回记录数超200个,则理论上每个搜索词最大可采页数为allowMaxPages
						for(int a =4;a >0;a--){           //循环采集,判断可采总页数,然后倒序页号采集
							//循环采集,根据总页数和最大可采页数(新浪一次调用接口只返回最新200条)
							JSONArray myJsonArray =null;
							String timestam ="";
							boolean isUpdate = false;//是否需要更新表中关键词最后采集时间;false为不更新,true为更新
							try{
								JSONObject sss = search.searchTopics(search_company, count,a);
								
								String st = sss.getString("statuses");
								
								System.out.println("返回内容: " + st);
								System.out.println("返回内容长度: " + st.length());
								if(st.length()<10){//st为空时只返回“[]”符号
									System.out.println("搜索词" + search_company + "第 " + a +"页返回结果为空");
//									System.out.println("本页返回结果为空");
									continue;
								}
	
								myJsonArray = new JSONArray(st);
								if((myJsonArray==null) || (myJsonArray.toString()=="")){
									//发现当myJsonArray返回为空时不做处理会抛错并中断程序
									//返回的错误代码:
									/*
									 *weibo4j.model.WeiboException: A JSONObject text must begin with '{' at character 1:<html>
									 *<head><title>504 Gateway Time-out</title></head>
									 *<body bgcolor="white">
									 *<center><h1>504 Gateway Time-out</h1></center>
									 *<hr><center>nginx</center>
									 *</body>
									 *</html>
									 */
									
									continue;
								}
								
								System.out.println("开始采集搜索词" + search_company + "第 " + a +"页内容");
								//判断每页第一条微博发表时间(和表t_weibo_crawl_time对比)即可判断内容是否已采集
								if(!HavingCrawl.isCrawlByPage(search_company, myJsonArray.getJSONObject(0).getString("created_at"))){
									System.out.println("本页微博内容已经采集过!");
									continue;
								}
							}catch (JSONException e) {
								e.printStackTrace();
							} catch (WeiboException e) {
								e.printStackTrace();
							} 
							
							
//							System.out.println(myJsonArray.length());
							//返回的每页内容是按微博发表时间降序排列,故对返回的记录做倒序遍历,便于和时间表的时间对比
							String tempTime = "";
							sinceTime = Cont.getDateBySearchName("select sina_end_time from " +crawlTimeTable + " where search_name =\"" + search_company + "\"");
							for (int ii = myJsonArray.length()-1; ii >=0; ii--) {
								String wei_text = null;
								String mid = null;
								String fro =  null;
								String image =  null;
								String user_id =  null;
								String head = null;
								String nick = null;
								String isVerified = null;
								try{
									// 获取每一个JsonObject对象
									JSONObject myjObject = myJsonArray.getJSONObject(ii);
									
									// 获取每一个对象中的值
									timestam = DatePaser.getDate(myjObject.getString("created_at"));
									
									
									if(!HavingCrawl.isCrawl(sinceTime, timestam)){
										continue;//返回的某页内某条微博内容创建时间早于表中记录,则跳出循环,不插入数据库
									}else{
										tempTime = timestam;
										isUpdate =true;
									}
								
									wei_text = myjObject.getString("text");//text返回的编码格式是unicode,某些unicode字符转成utf8编码之后有4个字节,mysql 只能存放3个字节的UTF8
//									UnicodeToUTF8.unicodeToUtf8(wei_text);
									mid = myjObject.getString("mid");
									fro = DeleteHtml.htmlFilter(myjObject.getString("source"));
									image = myjObject.getString("thumbnail_pic");//缩略图片地址,没有时不返回此字段
									
									//获取数组中对象的对象
									JSONObject myjObject2 = myjObject.getJSONObject("user");
								    nick = myjObject2.getString("screen_name");
								    isVerified = myjObject2.getString("verified");
								    head = myjObject2.getString("profile_image_url");
								    user_id = myjObject2.getString("id");
								    
								}catch (JSONException e) {
									e.printStackTrace();
								} 
								
							    
								pstmt.setString(1, user_id + WeiIDPaser.getWeiID(mid));
								pstmt.setString(2, head);
								pstmt.setString(3, nick);
								pstmt.setString(4, wei_text);
								pstmt.setString(5, timestam);
								pstmt.setString(6, timestam.substring(0, 10));
								pstmt.setString(7, fro);
								pstmt.setString(8, wei_platform);
								pstmt.setString(9, company_name);// 公司名
								pstmt.setString(10, search_company);// 插入搜索名
								pstmt.setString(11, company_code);// 插入公司代码
								pstmt.setString(12, isVerified);
								pstmt.addBatch();
								try{
									successNum = pstmt.executeBatch().length;// 成功插入数据库记录条数
									System.out.println(sql);
									System.out.println("成功插入到数据库数: " + successNum);	
									if(isUpdate){
										Cont.updateEndTime(timestam, search_company, crawlTimeTable);
										System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company);
									}
								}catch(BatchUpdateException e){
									e.printStackTrace();
								}
								 
							}
//							successNum = pstmt.executeBatch().length;// 成功插入数据库记录条数
//							System.out.println(sql);
//							System.out.println("成功插入到数据库数: " + successNum);	
//							if(isUpdate){
//								Cont.updateEndTime(timestam, search_company, crawlTimeTable);
//								System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company);
//							}
							System.out.println("关键词  " + search_company + "第 " + a + "采集完毕,休眠" + searchWaitTime + "秒钟后继续");
							Thread.sleep(searchWaitTime*1000);//每个关键词采集完后休眠时间
						}
	
					}
					if (company_name.equals(lastCompanyName)) {
						System.out.println("所有关键词采集完毕,休眠" + allSearchWaitTime + "分钟后继续轮循采集...");
						Thread.sleep(allSearchWaitTime* 60 * 1000);
					}
				}
				pstmt.close();
				conn.close();
			}catch (SQLException e) {
				e.printStackTrace();
			}catch (InterruptedException e) {
				e.printStackTrace();
			}
			
		}

	}


}



之前是把下面插入表的代码放在for循环外面,由于中途碰上4字节字条抛错经常导致程序中断或没采几个关键词就中断重头来,后面把它们放在 for循环内后,才解决,把抛异常限制在for循环内


try{  
                                    successNum = pstmt.executeBatch().length;// 成功插入数据库记录条数  
                                    System.out.println(sql);  
                                    System.out.println("成功插入到数据库数: " + successNum);   
                                    if(isUpdate){  
                                        Cont.updateEndTime(timestam, search_company, crawlTimeTable);  
                                        System.out.println(crawlTimeTable + "库表时间已经更新:更新的关键词" + search_company);  
                                    }  
                                }catch(BatchUpdateException e){  
                                    e.printStackTrace();  
                                }  



  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值