抖音短视频数据抓取实战系列(四)——Java解析Json文件数据并存库
项目目录
1、抖音短视频数据抓取实战系列(〇)——前言
2、抖音短视频数据抓取实战系列(一)——模拟器的选择与设置
3、抖音短视频数据抓取实战系列(二)——Fiddler安装配置以及模拟器监测环境配置
4、抖音短视频数据抓取实战系列(三)——Fiddler抓取抖音用户详细信息数据
5、抖音短视频数据抓取实战系列(四)——Java解析Json文件数据并存库
6、抖音短视频数据抓取实战系列(五)——Mitmproxy的安装以及模拟器Mitmproxy证书安装
7、抖音短视频数据抓取实战系列(六)——Mitmproxy+python编写监测程序
8、抖音短视频数据抓取实战系列(七)——python连接MySQL数据库
9、抖音短视频数据抓取实战系列(八)——Mitmproxy抓取用户详细信息并入库
10、抖音短视频数据抓取实战系列(九)——自动化Appium的环境与参数配置
11、抖音短视频数据抓取实战系列(十)——获取抖音dom元素属性
12、抖音短视频数据抓取实战系列(十一)——Appium与Mitmproxy联合-自动取存抖音用户信息
13、抖音短视频数据抓取实战系列(十二)——抓取实战BUG总集
1、用户数据成功保存到本地json文件之后,接下来我们就要解析json文件并将数据入库了。不过在此之前,我们需要先对数据进行分析,确定需要存储哪些数据信息;数据库构造相关的操作这里就不多做讲述了。下面是抓取到的单个用户的json数据示例:
{
"extra":{
"fatal_item_ids":[
],
"logid":"2020121816574701019806019933018921",
"now":1608281867000
},
"log_pb":{
"impr_id":"2020121816574701019806019933018921"
},
"status_code":0,
"user":{
"shop_micro_app":"",
"urge_detail":{
"user_urged":0
},
"live_commerce":true,
"signature_language":"zh",
"secret":0,
"apple_account":0,
"user_not_show":0,
"follower_count":15088831,
"is_block":false,
"avatar_168x168":{
"width":720,
"height":720,
"uri":"tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf",
"url_list":[
"https://p1-dy-ipv6.byteimg.com/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~tplv-dy-shrink:188:188.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=true&sh=188_188&sc=avatar&l=2020121816574701019806019933018921",
"https://p29-dy.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_168x168.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p5-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_168x168.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_168x168.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p29-dy.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_168x168.jpeg?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921"
]
},
"general_permission":{
"following_follower_list_toast":1
},
"location":"深圳",
"white_cover_url":[
{
"uri":"31ac700025de5c2cf06c4",
"url_list":[
"https://p3-dy-ipv6.byteimg.com/31ac700025de5c2cf06c4~tplv-dy-shrink:750:422.jpeg?from=2480802190&s=profile&se=true&sh=750_422&sc=cover&l=2020121816574701019806019933018921",
"https://p3-dy-ipv6.byteimg.com/obj/31ac700025de5c2cf06c4?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p26-dy.byteimg.com/obj/31ac700025de5c2cf06c4?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p6-dy-ipv6.byteimg.com/obj/31ac700025de5c2cf06c4?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921"
]
},
{
"uri":"318f1000413827e122102",
"url_list":[
"https://p3-dy-ipv6.byteimg.com/318f1000413827e122102~tplv-dy-shrink:750:422.jpeg?from=2480802190&s=profile&se=true&sh=750_422&sc=cover&l=2020121816574701019806019933018921",
"https://p29-dy.byteimg.com/obj/318f1000413827e122102?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p3-dy-ipv6.byteimg.com/obj/318f1000413827e122102?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p6-dy-ipv6.byteimg.com/obj/318f1000413827e122102?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921"
]
}
],
"room_id":0,
"video_icon":{
"uri":"",
"url_list":[
],
"width":720,
"height":720
},
"total_favorited":249455424,
"with_commerce_entry":true,
"youtube_channel_id":"",
"dongtai_count":328,
"city":"深圳",
"is_star":false,
"follow_status":0,
"following_count":308,
"unique_id":"CC233333",
"r_fans_group_info":{
},
"with_commerce_enterprise_tab_entry":false,
"commerce_user_info":{
"has_ads_entry":false,
"ad_revenue_rits":null,
"star_atlas":1,
"show_star_atlas_cooperation":true
},
"video_cover":{
},
"uid":"58948149403",
"aweme_count":323,
"mplatform_followers_count":15321101,
"sync_to_toutiao":1,
"commerce_user_level":0,
"ins_id":"",
"tab_settings":{
"private_tab":{
"show_private_tab":false,
"private_tab_style":1
}
},
"show_favorite_list":true,
"signature":"围博:蔡萝莉S\n商务:CCSW__(两个下划线)",
"favoriting_count":75749,
"message_chat_entry":true,
"gender":2,
"verification_type":0,
"youtube_channel_title":"",
"is_blocked":false,
"sec_uid":"MS4wLjABAAAAGAvitlc9VZrB4NQZDLPgTCKVL11-j8iMUFvk3ywnRpA",
"is_mix_user":false,
"short_id":"0",
"avatar_thumb":{
"width":720,
"height":720,
"uri":"tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf",
"url_list":[
"https://p9-dy.byteimg.com/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~tplv-dy-shrink:188:188.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=true&sh=188_188&sc=avatar&l=2020121816574701019806019933018921",
"https://p5-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_100x100.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_100x100.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p3-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_100x100.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p5-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_100x100.jpeg?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921"
]
},
"original_musician":{
"music_count":0,
"music_used_count":0,
"digg_count":0
},
"follower_status":0,
"profile_tab_type":0,
"is_effect_artist":false,
"birthday_hide_level":0,
"school_name":"",
"share_info":{
"bool_persist":1,
"share_url":"www.iesdouyin.com/share/user/58948149403?sec_uid=MS4wLjABAAAAGAvitlc9VZrB4NQZDLPgTCKVL11-j8iMUFvk3ywnRpA&did=3887519960166408&iid=1125546753005591",
"share_weibo_desc":"在抖音,记录美好生活!",
"share_desc":"在抖音,记录美好生活!",
"share_title":"快来加入抖音,让你发现最有趣的我!",
"share_qrcode_url":{
"uri":"216a0019bc825a6bec6b",
"url_list":[
"https://p3-dy-ipv6.byteimg.com/obj/216a0019bc825a6bec6b",
"https://p1-dy-ipv6.byteimg.com/obj/216a0019bc825a6bec6b",
"https://p9-dy.byteimg.com/obj/216a0019bc825a6bec6b"
]
},
"share_image_url":{
"url_list":null
}
},
"is_activity_user":false,
"cover_colour":"#03C23B37",
"twitter_name":"",
"iso_country_code":"CN",
"forward_count":5,
"enterprise_user_info":"{\"commerce_info\":{\"offline_info_list\":[],\"challenge_list\":[],\"task_list\":null,\"head_image_list\":null,\"smart_phone_list\":null},\"homepage_bottom_toast\":[],\"permissions\":[{\"Id\":5,\"Key\":\"UserShop\",\"Name\":\"个人橱窗\",\"AppId\":1128,\"Status\":1,\"Extra\":null,\"Customization\":null,\"Parent\":0,\"Actions\":null},{\"Id\":3,\"Key\":\"ItemShop\",\"Name\":\"视频电商\",\"AppId\":1128,\"Status\":1,\"Extra\":null,\"Customization\":null,\"Parent\":0,\"Actions\":null},{\"Id\":4,\"Key\":\"LiveShop\",\"Name\":\"直播电商\",\"AppId\":1128,\"Status\":1,\"Extra\":null,\"Customization\":null,\"Parent\":0,\"Actions\":null}]}",
"recommend_reason_relation":"",
"watch_status":false,
"is_gov_media_vip":false,
"country":"中国",
"nickname":"蔡萝莉🍒",
"birthday":"2001-01-01",
"with_new_goods":false,
"district":"",
"recommend_user_reason_source":0,
"followers_detail":[
{
"download_url":"https://d.douyin.com/JsvN/",
"package_name":"com.ss.android.ugc.aweme",
"app_name":"aweme",
"name":"抖音",
"icon":"http://p3.pstatp.com/origin/50ec00079b64de2050dc",
"fans_count":15088831,
"open_url":"snssdk1128://user/profile/58948149403?",
"apple_id":"1142110895"
},
{
"download_url":"https://d.toutiao.com/YjjY/",
"package_name":"com.ss.android.article.news",
"app_name":"news_article",
"name":"头条",
"icon":"http://p3.pstatp.com/origin/50ed00079a1b6b8d1fb1",
"fans_count":108406,
"open_url":"snssdk143://profile?uid=82586091539",
"apple_id":"529092160"
},
{
"open_url":"snssdk1112://profile?id=4503658575519899",
"apple_id":"1086047750",
"download_url":"http://d.huoshanzhibo.com/eFvB/",
"package_name":"com.ss.android.ugc.live",
"app_name":"live_stream",
"name":"抖音火山版",
"icon":"http://p3.pstatp.com/origin/2ea5c000abe106154adef",
"fans_count":123864
}
],
"commerce_info":{
"offline_info_list":[
],
"challenge_list":[
],
"task_list":null,
"head_image_list":null,
"smart_phone_list":null
},
"custom_verify":"",
"enterprise_verify_reason":"",
"with_fusion_shop_entry":true,
"twitter_id":"",
"cover_url":[
{
"uri":"31ac700025de5c2cf06c4",
"url_list":[
"https://p3-dy-ipv6.byteimg.com/31ac700025de5c2cf06c4~tplv-dy-shrink:750:422.jpeg?from=2480802190&s=profile&se=true&sh=750_422&sc=cover&l=2020121816574701019806019933018921",
"https://p3-dy-ipv6.byteimg.com/obj/31ac700025de5c2cf06c4?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p26-dy.byteimg.com/obj/31ac700025de5c2cf06c4?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p6-dy-ipv6.byteimg.com/obj/31ac700025de5c2cf06c4?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921"
]
},
{
"uri":"c8510002be9a3a61aad2",
"url_list":[
"https://p29-dy.byteimg.com/c8510002be9a3a61aad2~tplv-dy-shrink:750:422.jpeg?from=2480802190&s=profile&se=true&sh=750_422&sc=cover&l=2020121816574701019806019933018921",
"https://p29-dy.byteimg.com/obj/c8510002be9a3a61aad2?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p3-dy-ipv6.byteimg.com/obj/c8510002be9a3a61aad2?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921",
"https://p6-dy-ipv6.byteimg.com/obj/c8510002be9a3a61aad2?from=2480802190&s=profile&se=false&sh=&sc=cover&l=2020121816574701019806019933018921"
]
}
],
"avatar_larger":{
"uri":"tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf",
"url_list":[
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_1080x1080.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p3-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_1080x1080.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p9-dy.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_1080x1080.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_1080x1080.jpeg?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921"
],
"width":720,
"height":720
},
"avatar_medium":{
"uri":"tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf",
"url_list":[
"https://p29-dy.byteimg.com/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~tplv-dy-shrink:188:188.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=true&sh=188_188&sc=avatar&l=2020121816574701019806019933018921",
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_720x720.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p3-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_720x720.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p29-dy.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_720x720.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_720x720.jpeg?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921"
],
"width":720,
"height":720
},
"avatar_300x300":{
"width":720,
"height":720,
"uri":"tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf",
"url_list":[
"https://p26-dy.byteimg.com/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~tplv-dy-shrink:188:188.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=true&sh=188_188&sc=avatar&l=2020121816574701019806019933018921",
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_300x300.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p5-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_300x300.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p29-dy.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_300x300.webp?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921",
"https://p1-dy-ipv6.byteimg.com/img/tos-cn-avt-0015/5b04405f0f23e0eae6958ae083a571cf~c5_300x300.jpeg?from=2956013662&s=PackSourceEnum_USER_PROFILE&se=false&sh=&sc=avatar&l=2020121816574701019806019933018921"
]
},
"province":"广东"
}
}
2、然后我们对之前存储数据的json文件进行优化,将文件编码转换为UTF-8。博主是使用Notepad++进行转换的,下载地址:NotePad++ Download。优化之后内容如下:
3、java解析json文件并进行数据库操作,核心代码:
数据库操作代码:
package com.example.data.mapper;
import com.example.data.pojo.BiUser;
import com.example.data.pojo.DouYin;
import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Select;
import java.util.ArrayList;
/**
 * MyBatis mapper (data-access layer) for the {@code douyin} table.
 */
@Mapper
public interface DouYinMapper {

    /**
     * Inserts one user row; every column value is read from the matching
     * property of {@code douYin}.
     *
     * @param douYin the user record to store
     */
    @Insert("insert into douyin (favorited,following,followers,aweme_count,favoriting,dongtai,gender,unique_id," +
            "sec_uid,uid,nickname,enterprise,signature,img,phone,country,city,province,location,create_time,update_time,birthday,share_link) " +
            "values (#{favorited},#{following},#{followers},#{aweme_count},#{favoriting},#{dongtai},#{gender},#{unique_id}," +
            "#{sec_uid},#{uid},#{nickname},#{enterprise},#{signature},#{img},#{phone},#{country},#{city},#{province},#{location}," +
            "#{create_time},#{update_time},#{birthday},#{share_link})")
    void insertUer(DouYin douYin);

    /**
     * Returns a window of rows using {@code limit page,num}.
     *
     * @param page bound to the first {@code limit} argument (the row offset)
     * @param num  bound to the second {@code limit} argument (the row count)
     * @return the selected rows
     */
    @Select("select * from douyin limit #{param1},#{param2}")
    ArrayList<DouYin> queryUsers(int page, int num);

    /**
     * Looks up a user by its {@code uid} column.
     *
     * @param uid the uid to match
     * @return the matching row mapped to a {@link DouYin}
     */
    @Select("select * from douyin where uid=#{uid}")
    DouYin queryUser(String uid);

    /**
     * Removes every row from the {@code douyin} table.
     * Declared with {@code @Delete} so that MyBatis runs it as a DML statement
     * rather than as a query.
     */
    @Delete("delete from douyin")
    void deleteAll();
}
package com.example.data.service;
import com.example.data.mapper.DouYinMapper;
import com.example.data.pojo.DouYin;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Select;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.ArrayList;
/**
 * Service layer that forwards user persistence and lookup calls to
 * {@link DouYinMapper}.
 */
@Service("douYinService")
public class DouYinService {

    // Mapper injected by name; every method below delegates to it.
    @Resource(name="douYinMapper")
    DouYinMapper douYinMapper;

    /** @return the mapper this service delegates to */
    public DouYinMapper getDouYinMapper() {
        return this.douYinMapper;
    }

    /** @param douYinMapper the mapper this service should delegate to */
    public void setDouYinMapper(DouYinMapper douYinMapper) {
        this.douYinMapper = douYinMapper;
    }

    /**
     * Stores one user record.
     *
     * @param douYin the record to insert
     */
    public void insertUer(DouYin douYin) {
        this.douYinMapper.insertUer(douYin);
    }

    /**
     * Fetches one page of users.
     *
     * @param page page number; page 1 maps to row offset 0
     * @param num  page size
     * @return the rows returned by the mapper for that window
     */
    public ArrayList<DouYin> queryUsers(int page, int num) {
        final int offset = (page - 1) * num;
        return this.douYinMapper.queryUsers(offset, num);
    }

    /**
     * Looks up a single user.
     *
     * @param uid the uid to search for
     * @return whatever the mapper returns for that uid
     */
    public DouYin queryUser(String uid) {
        return this.douYinMapper.queryUser(uid);
    }

    /** Deletes every stored user through the mapper. */
    public void deleteAll() {
        this.douYinMapper.deleteAll();
    }
}
在线图片下载代码:
package com.example.data.service;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.UUID;
/**
 * HTTP helper service built on Apache HttpClient: fetches response bodies as
 * text and downloads images into a local directory.
 */
@Service("apiService")
public class ApiService {

    /** Directory (with trailing slash) that downloaded images are written into. */
    private static final String IMAGE_DIR = "E:/Spring/database/src/main/resources/image/douyin/";

    // Injected HttpClient connection manager used by every request.
    @Resource
    private PoolingHttpClientConnectionManager cm;

    public PoolingHttpClientConnectionManager getCm() {
        return cm;
    }

    public void setCm(PoolingHttpClientConnectionManager cm) {
        this.cm = cm;
    }

    /**
     * Performs a GET request and returns the response body decoded as UTF-8.
     *
     * @param url address to request
     * @return the body for a 200 response (an empty string when the response
     *         carries no entity); {@code null} for any other status code or
     *         when the request fails
     */
    public String getHtml(String url) {
        CloseableHttpClient httpClient = newClient();
        HttpGet httpGet = newGet(url);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String html = "";
                // EntityUtils.toString fails on a null entity, so check first.
                if (response.getEntity() != null) {
                    html = EntityUtils.toString(response.getEntity(), "UTF-8");
                }
                return html;
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Only the response is closed; the client must stay open because
            // it is built on the injected connection manager.
            closeQuietly(response);
        }
        return null;
    }

    /**
     * Downloads an image and stores it under a random UUID-based file name
     * inside {@link #IMAGE_DIR}.
     *
     * @param url address of the image
     * @return the generated file name (without directory), or {@code null}
     *         when the status is not 200, the response has no usable
     *         Content-Type, or the download/write fails
     */
    public String getImage(String url) {
        CloseableHttpClient httpClient = newClient();
        HttpGet httpGet = newGet(url);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            // Without an entity or a Content-Type header there is nothing to
            // save and no way to derive the file extension.
            if (response.getEntity() == null || response.getEntity().getContentType() == null) {
                return null;
            }
            String extName = extensionFor(response.getEntity().getContentType().getValue());
            if (extName == null) {
                return null;
            }
            // Random file name so that downloads never overwrite each other.
            String imgName = UUID.randomUUID().toString() + extName;
            // try-with-resources guarantees the file handle is released even
            // when writing the body fails.
            try (OutputStream outputStream = new FileOutputStream(new File(IMAGE_DIR + imgName))) {
                response.getEntity().writeTo(outputStream);
            }
            return imgName;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Only the response is closed; the client must stay open because
            // it is built on the injected connection manager.
            closeQuietly(response);
        }
        return null;
    }

    /** Builds a client on top of the injected connection manager. */
    private CloseableHttpClient newClient() {
        return HttpClients.custom().setConnectionManager(cm).build();
    }

    /** Creates the GET request with the (empty) User-Agent header and the shared timeouts. */
    private HttpGet newGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", "");
        httpGet.setConfig(this.getConfig());
        return httpGet;
    }

    /**
     * Derives a file extension from a Content-Type value, ignoring any
     * parameters, e.g. {@code "image/jpeg; charset=binary"} gives {@code ".jpeg"}.
     *
     * @return the extension including the leading dot, or {@code null} when
     *         the value has no {@code type/subtype} form
     */
    private static String extensionFor(String contentType) {
        if (contentType == null) {
            return null;
        }
        String mediaType = contentType.split(";")[0].trim();
        int slash = mediaType.indexOf('/');
        if (slash < 0 || slash == mediaType.length() - 1) {
            return null;
        }
        return "." + mediaType.substring(slash + 1);
    }

    /** Closes the response if present, logging instead of propagating any failure. */
    private static void closeQuietly(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Builds the request configuration (timeouts) applied to every request. */
    private RequestConfig getConfig() {
        return RequestConfig.custom()
                .setConnectTimeout(1000)          // timeout for establishing the connection
                .setConnectionRequestTimeout(500) // timeout for obtaining a connection from the manager
                .setSocketTimeout(10000)          // socket timeout while transferring data
                .build();
    }
}
核心程序,json解析功能代码:
package com.example.data.service;
import com.example.data.DataApplication;
import com.example.data.pojo.DouYin;
import net.sf.json.JSONArray;
import org.junit.Test;
import net.sf.json.JSONObject;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import javax.annotation.Resource;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Date;
/**
 * Spring-backed test that reads captured user JSON from disk, converts every
 * entry of its {@code result} array into a {@link DouYin} and stores it
 * through {@link DouYinService}.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@SpringBootTest(classes= DataApplication.class)
public class DouYinServiceTest {

    @Resource(name="apiService")
    ApiService apiService;

    @Resource(name="douYinService")
    DouYinService douYinService;

    public ApiService getApiService() {
        return apiService;
    }

    public void setApiService(ApiService apiService) {
        this.apiService = apiService;
    }

    public DouYinService getDouYinService() {
        return douYinService;
    }

    public void setDouYinService(DouYinService douYinService) {
        this.douYinService = douYinService;
    }

    /** Removes every stored user. */
    @Test
    public void delete(){
        douYinService.deleteAll();
    }

    /**
     * Parses the JSON file and inserts every user that {@link #getUser}
     * returns; entries for which it returns {@code null} are counted and
     * reported instead of being inserted.
     */
    @Test
    public void testCrawler(){
        try {
            File jsonFile = new File("C:\\Users\\86187\\Desktop\\json\\result7.json");
            String jsonStr = readFile(jsonFile);
            JSONArray jsonArray = JSONObject.fromObject(jsonStr).getJSONArray("result");
            int num = 0;
            for (int i = 0; i < jsonArray.size(); i++) {
                DouYin dy = getUser(jsonArray.getJSONObject(i));
                if (dy == null) {
                    num++;
                    System.out.println("第"+num+"重复数据");
                } else {
                    douYinService.insertUer(dy);
                }
                System.out.println("************** 已完成 "+(i+1)+" / "+jsonArray.size()+" **********************");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("-----------任务已完成-------------");
    }

    /**
     * Reads a whole file as UTF-8 text.
     * The reader is opened in try-with-resources so the file handle is
     * released even when reading fails.
     */
    private static String readFile(File file) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"))) {
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                sb.append(buffer, 0, read);
            }
        }
        return sb.toString();
    }

    /**
     * Maps one captured response object to a {@link DouYin}.
     * Fields may be absent from a response, so each one is checked for
     * presence and falls back to a default ("" or 0) when missing.
     *
     * @param o a response object holding the profile under the {@code user} key
     * @return the populated record, or {@code null} when a user with the same
     *         uid is already stored or the object cannot be parsed
     */
    private DouYin getUser(JSONObject o){
        try {
            JSONObject user = o.getJSONObject("user");

            String uid = stringOrEmpty(user, "uid");
            // Already stored: signal the caller to skip this entry.
            if (douYinService.queryUser(uid) != null) {
                return null;
            }

            Integer favorited = intOrZero(user, "total_favorited");
            Integer following = intOrZero(user, "following_count");
            Integer followers = intOrZero(user, "mplatform_followers_count");
            Integer aweme_count = intOrZero(user, "aweme_count");
            Integer favoriting = intOrZero(user, "favoriting_count");
            Integer dongtai = intOrZero(user, "dongtai_count");
            Integer gender = intOrZero(user, "gender");
            String unique_id = stringOrEmpty(user, "unique_id");
            String sec_uid = stringOrEmpty(user, "sec_uid");
            String nickname = stringOrEmpty(user, "nickname");
            String enterprise = stringOrEmpty(user, "enterprise_verify_reason");
            String signature = stringOrEmpty(user, "signature");

            String img = downloadAvatar(user);
            String phone = findPhone(user);

            String country = stringOrEmpty(user, "country");
            String city = stringOrEmpty(user, "city");
            String province = stringOrEmpty(user, "province");
            String location = stringOrEmpty(user, "location");
            String birthday = stringOrEmpty(user, "birthday");
            String share_link = "https://www.iesdouyin.com/share/user/"+uid+"?sec_uid="+sec_uid;

            // HH = 24-hour clock; "hh" would store afternoon times as morning
            // ones because the pattern carries no AM/PM marker.
            String time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());

            DouYin dy = new DouYin();
            dy.setAweme_count(aweme_count);
            dy.setBirthday(birthday);
            dy.setCity(city);
            dy.setCountry(country);
            dy.setCreate_time(time);
            dy.setDongtai(dongtai);
            dy.setEnterprise(enterprise);
            dy.setFavorited(favorited);
            dy.setFavoriting(favoriting);
            dy.setFollowers(followers);
            dy.setFollowing(following);
            dy.setGender(gender);
            dy.setImg(img);
            dy.setLocation(location);
            dy.setNickname(nickname);
            dy.setPhone(phone);
            dy.setSec_uid(sec_uid);
            dy.setProvince(province);
            dy.setSignature(signature);
            dy.setUid(uid);
            dy.setShare_link(share_link);
            dy.setUnique_id(unique_id);
            dy.setUpdate_time(time);
            return dy;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Downloads the first {@code avatar_medium.url_list} entry.
     *
     * @return the stored image name as returned by {@link #getCarImage}, or ""
     *         when the profile has no avatar URL (missing, null or empty list)
     */
    private String downloadAvatar(JSONObject user){
        JSONObject avatar = childObject(user, "avatar_medium");
        if (avatar == null) {
            return "";
        }
        JSONArray urls = childArray(avatar, "url_list");
        if (urls == null || urls.size() == 0) {
            return "";
        }
        return getCarImage(urls.get(0).toString());
    }

    /**
     * Scans {@code commerce_info.offline_info_list} for entries whose
     * {@code text} is "联系电话" and returns the {@code action} of the last
     * such entry, or "" when none is found.
     */
    private static String findPhone(JSONObject user){
        String phone = "";
        JSONObject commerce = childObject(user, "commerce_info");
        if (commerce == null) {
            return phone;
        }
        JSONArray offline = childArray(commerce, "offline_info_list");
        if (offline == null) {
            return phone;
        }
        for (int i = 0; i < offline.size(); i++) {
            JSONObject entry = offline.getJSONObject(i);
            // Skip entries lacking either key instead of failing the whole user.
            if (entry.has("text") && entry.has("action") && "联系电话".equals(entry.getString("text"))) {
                phone = entry.getString("action");
            }
        }
        return phone;
    }

    /** Returns the string value stored under {@code key}, or "" when the key is absent. */
    private static String stringOrEmpty(JSONObject json, String key){
        return json.has(key) ? json.getString(key) : "";
    }

    /** Returns the int value stored under {@code key}, or 0 when the key is absent. */
    private static Integer intOrZero(JSONObject json, String key){
        return json.has(key) ? json.getInt(key) : 0;
    }

    /** Returns the nested object under {@code key}, or {@code null} when absent or not an object. */
    private static JSONObject childObject(JSONObject parent, String key){
        Object value = parent.has(key) ? parent.get(key) : null;
        return value instanceof JSONObject ? (JSONObject) value : null;
    }

    /** Returns the nested array under {@code key}, or {@code null} when absent or not an array. */
    private static JSONArray childArray(JSONObject parent, String key){
        Object value = parent.has(key) ? parent.get(key) : null;
        return value instanceof JSONArray ? (JSONArray) value : null;
    }

    /** Downloads the image behind {@code val} and returns the name reported by the API service. */
    private String getCarImage(String val){
        return apiService.getImage(val);
    }
}
需要特别注意的是:o.getJSONObject("user").has("province"),抖音返回的json数据字段可能不一样,所以我们需要先判断这个字段是否存在,只有存在才获取它的值,反之则设置默认值。
4、通过java解析json文件之后,json文件里的用户信息即可存入数据库,正如下图所示。
5、至此,通过Fiddler监测并抓取用户信息数据已经完成,接下来,我们使用python+Mitmproxy进行数据抓取。
项目目录
1、抖音短视频数据抓取实战系列(〇)——前言
2、抖音短视频数据抓取实战系列(一)——模拟器的选择与设置
3、抖音短视频数据抓取实战系列(二)——Fiddler安装配置以及模拟器监测环境配置
4、抖音短视频数据抓取实战系列(三)——Fiddler抓取抖音用户详细信息数据
5、抖音短视频数据抓取实战系列(四)——Java解析Json文件数据并存库
6、抖音短视频数据抓取实战系列(五)——Mitmproxy的安装以及模拟器Mitmproxy证书安装
7、抖音短视频数据抓取实战系列(六)——Mitmproxy+python编写监测程序
8、抖音短视频数据抓取实战系列(七)——python连接MySQL数据库
9、抖音短视频数据抓取实战系列(八)——Mitmproxy抓取用户详细信息并入库
10、抖音短视频数据抓取实战系列(九)——自动化Appium的环境与参数配置
11、抖音短视频数据抓取实战系列(十)——获取抖音dom元素属性
12、抖音短视频数据抓取实战系列(十一)——Appium与Mitmproxy联合-自动取存抖音用户信息
13、抖音短视频数据抓取实战系列(十二)——抓取实战BUG总集