最近玩了一款社交软件same,自己用爬虫的方式爬下了里面一些频道的内容。
采用工具:
1、Fiddler 4
2、java
步骤:
主要通过两个步骤:
1、使用Fiddler 4抓取手机APP数据包。2、java编程,通过java发出http请求,爬取频道内容。
1、使用Fiddler 4 抓取手机APP数据包。
(1)设置Fiddler 4 (配置完后记得要重启Fiddler)
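打开Fiddler,依次进入 Tools -> Options:在 Connections 选项卡中勾选“Allow remote computers to connect”;由于要抓取的是https请求,还需要在 HTTPS 选项卡中勾选“Decrypt HTTPS traffic”。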
记住这个端口号是:8888
(2)设置安卓手机
首先获取PC的ip地址:命令行中输入:ipconfig,获取ip地址
下面来对Android手机进行代理设置:手机和PC连接同一个Wi-Fi,在该Wi-Fi的代理设置中选择“手动”,主机名填写上一步获取的PC的ip地址,端口填写8888。
(3)打开手机应用,然后可以看到Fiddler 4 里的http链接。找到属于手机应用的链接。
这里我选用的app same的每日搭配频道的http请求是:https://v2.same.com/channel/1002393/senses
2、java编程,通过java发出http请求,爬取频道内容。
获取到的链接:https://v2.same.com/channel/1002393/senses 直接采用浏览器访问,如下图:
进一步分析:
通过java解析json数据,保存爬到的图片
java代码如下(Github地址:https://github.com/Keeplingshi/SameGet ):
package com.main;
/**
 * Command-line entry point of the crawler.
 *
 * <p>Starts a crawl of one hard-coded channel feed and hands the work to
 * {@link SameGet#loop_read(String, String, int[])}.
 */
public class Main {

    /**
     * Crawls the channel feed configured below.
     *
     * <p>Other feeds this driver has been pointed at (swap the start url and
     * the target folder to use them):
     * <ul>
     *   <li>{@code https://v2.same.com/channel/1032823/senses}, optionally
     *       resumed with {@code ?offset=47345399}, saved under
     *       {@code C:/Users/Administrator/Desktop/same长腿a杯/}</li>
     *   <li>{@code https://v2.same.com/channel/1002393/senses} - the channel
     *       crawled below, from its first page</li>
     *   <li>{@code https://v2.same.com/channel/1512177/senses} - the "672"
     *       channel</li>
     * </ul>
     * Channel ids can be looked up by name with {@code SameGet.query(...)}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Like-count boundaries that split the pictures into level folders.
        final int[] likeBoundaries = {20, 100, 200, 300};

        // Destination root; the level folders live directly below it.
        final String targetFolder = "F:/same/每日搭配/";

        // Resume point inside the channel feed (offset taken from an earlier run).
        final String startUrl = "https://v2.same.com/channel/1002393/senses?offset=53455743";

        SameGet.loop_read(startUrl, targetFolder, likeBoundaries);
    }
}
package com.main;
import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
/**
 * Downloads the content behind an HTTP(S) url into a local file.
 */
public class UrlToImage {

    /** Size in bytes of the buffer used while copying the response body. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Saves the resource at {@code destUrl} to the file {@code imageName}.
     *
     * <p>The call is best-effort: it never throws for download or file-system
     * problems. Failures are reported on {@code System.err} instead of being
     * dropped silently, so a crawl can continue with the next picture while
     * the operator can still see what went wrong.
     *
     * <p>Missing parent directories of {@code imageName} are created. The
     * target file is only opened once the server has started to answer, so a
     * rejected request does not leave an empty file behind.
     *
     * @param destUrl   url of the resource; must be an http or https url
     * @param imageName path and name of the file to write
     */
    public static void saveToFile(String destUrl, String imageName) {
        if (destUrl == null || imageName == null) {
            System.err.println("saveToFile skipped: url=" + destUrl + ", file=" + imageName);
            return;
        }

        HttpURLConnection httpUrl = null;
        try {
            java.net.URLConnection connection = new URL(destUrl).openConnection();
            // Check the type instead of casting blindly and catching ClassCastException.
            if (!(connection instanceof HttpURLConnection)) {
                System.err.println("saveToFile skipped, not an http(s) url: " + destUrl);
                return;
            }
            httpUrl = (HttpURLConnection) connection;
            httpUrl.connect();

            // Without this a missing target folder makes every download fail.
            java.io.File parent = new java.io.File(imageName).getAbsoluteFile().getParentFile();
            if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
                throw new IOException("cannot create directory " + parent);
            }

            // try-with-resources closes both streams even when one close fails.
            try (BufferedInputStream bis = new BufferedInputStream(httpUrl.getInputStream());
                 FileOutputStream fos = new FileOutputStream(imageName)) {
                byte[] buf = new byte[BUFFER_SIZE];
                int size;
                while ((size = bis.read(buf)) != -1) {
                    fos.write(buf, 0, size);
                }
                fos.flush();
            }
        } catch (IOException e) {
            System.err.println("Failed to save " + destUrl + " to " + imageName + ": " + e);
        } finally {
            if (httpUrl != null) {
                httpUrl.disconnect();
            }
        }
    }
}
package com.main;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
/**
 * Small crawler for the JSON feeds served under {@code https://v2.same.com}.
 *
 * <p>It can search channels by name ({@link #query(String)}) and walk a
 * channel feed page by page, saving the photo of every entry
 * ({@link #loop_read(String, String, int[])}).
 */
public class SameGet {

    /** Scheme and host that relative links from the feed are appended to. */
    private static final String BASE_URL = "https://v2.same.com";

    /** Extension used when a photo url does not end in one. */
    private static final String DEFAULT_FILE_TYPE = "jpg";

    /**
     * Searches channels by name and prints one line per hit to standard
     * output: id, name and the url of the channel feed, separated by tabs.
     *
     * <p>Nothing is printed when the request fails or the response carries no
     * data object.
     *
     * @param str search term; it is url-encoded as UTF-8 before being sent, so
     *            non-ASCII terms are safe
     */
    public static void query(String str) {
        String encoded;
        try {
            encoded = URLEncoder.encode(str == null ? "" : str, StandardCharsets.UTF_8.name());
        } catch (UnsupportedEncodingException e) {
            // Every JVM is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not supported by this JVM", e);
        }

        String query_result = fetch_data(BASE_URL + "/channel/search?query=" + encoded);
        JSONObject data_json = parseData(query_result);
        if (data_json == null) {
            return;
        }

        for (Object object : parseResults(data_json)) {
            JSONObject each_json = JSONObject.fromObject(object.toString());
            String id_str = String.valueOf(each_json.get("id"));
            String name_str = String.valueOf(each_json.get("name"));
            String url_str = BASE_URL + "/channel/" + id_str + "/senses";
            System.out.println(id_str + "\t" + name_str + "\t" + url_str);
        }
    }

    /**
     * Walks a channel feed starting at {@code nexturl} and saves the photos of
     * every page until a page has no follow-up link or cannot be read.
     *
     * <p>The pages are processed in a loop rather than by recursion, so the
     * number of pages is not limited by the thread's stack size.
     *
     * @param nexturl    url of the first page; {@code null} or empty does nothing
     * @param saveFolder folder prefix the level folders are appended to
     * @param splitnum   ascending like-count boundaries, see
     *                   {@link #read_data(String, String, int[])}
     */
    public static void loop_read(String nexturl, String saveFolder, int[] splitnum) {
        String current = nexturl;
        while (current != null && !"".equals(current)) {
            String returnData = fetch_data(current);
            String following = read_data(returnData, saveFolder, splitnum);
            if (current.equals(following)) {
                // A page that links to itself would otherwise never terminate.
                System.err.println("Feed page links to itself, stopping at " + current);
                break;
            }
            current = following;
        }
    }

    /**
     * Saves the photos of one feed page and returns the url of the next page.
     *
     * <p>Each photo is stored as
     * {@code saveFolder + level + "/" + id + "_" + created_at + "_" + likes + "." + extension},
     * where {@code level} is a letter derived from the like count: {@code a}
     * below {@code splitnum[0]}, {@code b} below {@code splitnum[1]} and so
     * on, with one further letter for everything at or above the last
     * boundary. Entries without a photo are skipped.
     *
     * @param str        JSON text of the page; {@code null} or blank yields {@code null}
     * @param saveFolder folder prefix the level folders are appended to
     * @param splitnum   like-count boundaries; any length, {@code null} is
     *                   treated as no boundaries
     * @return absolute url of the next page, or {@code null} when there is none
     */
    public static String read_data(String str, String saveFolder, int[] splitnum) {
        JSONObject data_json = parseData(str);
        if (data_json == null) {
            return null;
        }

        for (Object object : parseResults(data_json)) {
            JSONObject each_json = JSONObject.fromObject(object.toString());

            Object photo = each_json.get("photo");
            if (isAbsent(photo)) {
                continue; // nothing to download for this entry
            }
            String photo_str = photo.toString();

            String id_str = String.valueOf(each_json.get("id"));
            String created_at_str = String.valueOf(each_json.get("created_at"));
            int likes = parseLikes(each_json.get("likes"));

            String filename = saveFolder + levelFor(likes, splitnum) + "/"
                    + id_str + "_" + created_at_str + "_" + likes + "." + fileTypeOf(photo_str);

            UrlToImage.saveToFile(photo_str, filename);
            System.out.println(photo_str);
        }

        Object next = data_json.get("next");
        if (isAbsent(next)) {
            return null;
        }
        String next_str = BASE_URL + next.toString();
        System.out.println(next_str);
        return next_str;
    }

    /**
     * Fetches the body of {@code urlstr} with an HTTP GET request.
     *
     * @param urlstr http or https url to request
     * @return the response body decoded as UTF-8, or {@code null} when the url
     *         is not an http(s) url or the request fails
     */
    public static String fetch_data(String urlstr) {
        HttpURLConnection urlConnection = null;
        try {
            java.net.URLConnection connection = new URL(urlstr).openConnection();
            if (!(connection instanceof HttpURLConnection)) {
                System.err.println("Not an http(s) url: " + urlstr);
                return null;
            }
            urlConnection = (HttpURLConnection) connection;
            urlConnection.setRequestMethod("GET");
            urlConnection.connect();
            return ConvertToString(urlConnection.getInputStream());
        } catch (IOException e) {
            System.err.println("Request failed for " + urlstr + ": " + e);
            return null;
        } finally {
            if (urlConnection != null) {
                urlConnection.disconnect();
            }
        }
    }

    /**
     * Reads a stream to its end as UTF-8 text and closes it.
     *
     * <p>Every line is terminated with {@code '\n'} in the result, whatever
     * line terminator the input used. If reading fails part-way, the text read
     * so far is returned and the error is reported on {@code System.err}.
     *
     * @param inputStream stream to read; closed before returning
     * @return the decoded text, never {@code null}
     */
    public static String ConvertToString(InputStream inputStream) {
        StringBuilder result = new StringBuilder();
        // Closing the outermost reader closes the wrapped reader and stream.
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                result.append(line).append('\n');
            }
        } catch (IOException e) {
            System.err.println("Reading the stream failed: " + e);
        }
        return result.toString();
    }

    /**
     * Reads a file stream to its end as UTF-8 text and closes it.
     *
     * <p>Delegates to {@link #ConvertToString(InputStream)} so that files and
     * network responses are decoded with the same charset instead of the
     * platform default.
     *
     * @param inputStream file stream to read; closed before returning
     * @return the decoded text, never {@code null}
     */
    public static String ConvertToString(FileInputStream inputStream) {
        // The cast selects the InputStream overload; without it this would recurse.
        return ConvertToString((InputStream) inputStream);
    }

    /**
     * Parses a response and returns its {@code "data"} object, or {@code null}
     * when the response is empty or has no such object.
     */
    private static JSONObject parseData(String response) {
        if (response == null || response.trim().isEmpty()) {
            System.err.println("Empty response, nothing to parse");
            return null;
        }
        Object data = JSONObject.fromObject(response).get("data");
        if (isAbsent(data)) {
            System.err.println("Response has no \"data\" object");
            return null;
        }
        return JSONObject.fromObject(data.toString());
    }

    /** Returns the {@code "results"} array of a data object; empty when it is missing. */
    private static JSONArray parseResults(JSONObject data_json) {
        Object results = data_json.get("results");
        return JSONArray.fromObject(isAbsent(results) ? "[]" : results.toString());
    }

    /**
     * Tells whether a value read from a JSON object should count as missing.
     *
     * <p>Besides Java {@code null} this covers empty text and the text
     * {@code "null"}. NOTE(review): the JSON library presumably hands back a
     * non-null placeholder for a JSON null whose text form is "null" - confirm
     * against the library documentation.
     */
    private static boolean isAbsent(Object value) {
        if (value == null) {
            return true;
        }
        String text = value.toString();
        return text == null || text.isEmpty() || "null".equals(text);
    }

    /** Converts a like count to an int; missing or non-numeric values count as 0. */
    private static int parseLikes(Object likes) {
        if (isAbsent(likes)) {
            return 0;
        }
        try {
            return Integer.parseInt(likes.toString().trim());
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Maps a like count to its level letter: the position of the first
     * boundary that is greater than {@code likes}, counted from {@code 'a'}.
     */
    private static String levelFor(int likes, int[] splitnum) {
        int level = 0;
        if (splitnum != null) {
            while (level < splitnum.length && likes >= splitnum[level]) {
                level++;
            }
        }
        return String.valueOf((char) ('a' + level));
    }

    /**
     * Extracts the file extension from the last path segment of a photo url,
     * ignoring query string and fragment; falls back to
     * {@link #DEFAULT_FILE_TYPE} when the segment has no extension.
     */
    private static String fileTypeOf(String photoUrl) {
        String path;
        try {
            path = new URL(photoUrl).getPath();
        } catch (IOException e) {
            // Not a parseable url: fall back to treating the text as a path.
            path = photoUrl;
        }
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        int dot = lastSegment.lastIndexOf('.');
        if (dot < 0 || dot == lastSegment.length() - 1) {
            return DEFAULT_FILE_TYPE;
        }
        return lastSegment.substring(dot + 1);
    }
}
参考: Fiddler抓取手机APP数据包 https://my.oschina.net/jhao104/blog/605963