因为个人需要爬取某个吧内的所有帖子并保存下来,但是无奈不会写python,于是就想到了利用java的jsoup去做爬虫,尝试了多次后终于获取成功,主要遇到的问题还是百度的反爬机制,解决了后,整理一篇文章,以便日后查阅。
通过Jsoup爬取百度贴吧数据
通过链接提取吧内所有帖子数据,并输出为.txt或者md文件
效果图:
文件内:
先不多bb直接上代码
/**
* @Author: xy
* @Date: 2021/3/1 21:10
* 爬取贴吧的所有数据,到对应的吧第一页,然后复制地址栏的地址传入createHome()方法即可
*/
@RestController
@RequestMapping(value = "xy/getTxt", produces = "text/plain;charset=utf-8")
public class JsoupController {
/**
 * Request headers sent when fetching a forum list page.
 */
public static final Map<String, String> HOME_HEARD_MAP = new HashMap<>();
/**
 * Request headers sent when fetching a thread page.
 */
public static final Map<String, String> CONTENT_HEARD_MAP = new HashMap<>();
/**
 * Number of the list page currently being crawled; only used in progress output.
 * NOTE(review): starting at 1 is an assumption, the original listing never declared this field.
 */
private static int PAGE = 1;
/**
 * Running total of thread links found on all list pages crawled so far.
 */
private static int TotalCount = 0;
/**
 * Running total of post elements matched on all thread pages crawled so far.
 */
private static int ContentTotalCount = 0;
public static Map<String, String> getHomeHeardMap() {
HOME_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
HOME_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
HOME_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
HOME_HEARD_MAP.put("Connection", "keep-alive");
if (!HOME_HEARD_MAP.containsKey("Cookie")) {
HOME_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; delPer=0; PSINO=6; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598932,1614598938,1614651172,1614656659; BCLID=6696522698307635931; BDSFRCVID=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; BCLID_BFESS=6696522698307635931; BDSFRCVID_BFESS=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; 
H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e787570f1fcab17da5132635e67fb49ac3a2ee926ac26ab414b8dc1f022a26b6af0be5ec5feb08e47ecea40e7c3ac42af63418eb176202b934e8a5d65a31f7f67c9; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614661234; st_sign=46af36f3; st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848b4e9bf304bec24493f1d19c3b7526fc3ce49228569294a5afc07905bed78d5368; BAIDUID_BFESS=CDDCC97066F658F1A310835A932F3477:FG=1; BA_HECTOR=04ahah0ga1a5800ko11g3ri0j0r; ZD_ENTRY=baidu; ab_sr=1.0.0_MmE3OWNlMWI1NjEwM2RiYTNmNmUwNjRiZTZiOWUxZjNhMWVjOTU5Y2ZjYzM3YTdiNWNhMTU1ZjRiZTFhNzRhZDU3NTk1Y2RkMGU4MzQyMzkzM2U4OTYzZTYxMDE3OGQ1");
}
HOME_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
return HOME_HEARD_MAP;
}
public static Map<String, String> getContentHeardMap() {
CONTENT_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
CONTENT_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
CONTENT_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
CONTENT_HEARD_MAP.put("Connection", "keep-alive");
CONTENT_HEARD_MAP.put("Cache-Control", "max-age=0");
if (!HOME_HEARD_MAP.containsKey("Cookie")) {
CONTENT_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; BDSFRCVID_BFESS=oj0OJexroG3V4WbeM-8t8PVZdeKK0gOTDYLtOwXPsp3LGJLVgVbOEG0PtEhTCoub_2AUogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbRO4-TF-jjQyDU5; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598461,1614598932,1614598938,1614651172; delPer=0; PSINO=6; BA_HECTOR=018g2g0l25052l0g3a1g3r9t70q; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e78d3c5a1aeac7bb1e60c92309e4169db1b26e5605e313a1f6752c4b482431b1fe7faa5841c98f1d6409e6296b85974757ebd72ed37a64a1ff6d29f3ddc6e838db0; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614656468; ab_sr=1.0.0_MDAwYzVjYzdmZjhhNzYzNjhlMGZmZWRiN2FkYmJiZGMwM2VmZmJmY2ZmNDljNWRmYWEyNzE2NDk5YzRhNGExNjQzNTEzM2I5YjMwZGRkYTgwMmE1MjQyNWFmNjc1ZGUw; 
st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848bc1e6ab946a724413075895137bad90ae1601113654bfc92f8dec8bf5d540fc12; st_sign=6215e595");
}
CONTENT_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
return CONTENT_HEARD_MAP;
}
/**
 * Crawls one forum list page: visits every thread linked from it via {@code content(...)},
 * then recurses into the "next page" link if there is one.
 *
 * <p>When the response contains the anti-crawler marker text, the method blocks on standard
 * input until a fresh cookie is pasted, stores it in {@code HOME_HEARD_MAP} and retries the
 * same URL.
 *
 * @param href absolute URL of the list page to crawl
 * @return the literal string {@code "ok"} once this page and all following pages are processed
 * @throws IOException if a page cannot be fetched or a thread file cannot be written
 */
@PostMapping("public/createHome")
public String createHome(String href) throws IOException {
    // Open the connection and fetch the list page.
    Document doc = Jsoup.connect(href)
            .headers(getHomeHeardMap())
            .get();
    // Strip HTML comment delimiters so markup wrapped in comments becomes parseable.
    String html = doc.body().toString().replace("<!--", "").replace("-->", "");
    if (html.contains("网络不给力")) {
        System.out.println("已触发百度反爬虫验证,程序已暂停,请更换贴吧列表cookie,终止链接是:\"" + href + "\"");
        System.out.println("在下面粘贴从浏览器复制的cookie");
        // The Scanner is intentionally not closed: closing it would close System.in.
        String next = new Scanner(System.in).nextLine();
        // Swap in the cookie that passed the verification.
        HOME_HEARD_MAP.put("Cookie", next);
        // Retry and stop here. Falling through would parse the blocked page, find no
        // threads and print misleading totals after the retry has already done the work.
        return createHome(href);
    }
    Document parse;
    try {
        parse = Jsoup.parse(html);
    } catch (Exception e) {
        // Keep the cause so the real parser failure is visible in the stack trace.
        throw new RuntimeException("解析贴外列表时异常,请尝试更换Jsoup版本", e);
    }
    // Thread title links on the current list page.
    Elements aContent = parse.select("a.j_th_tit");
    TotalCount += aContent.size();
    for (Element element : aContent) {
        String text = element.text();
        if ("专用水楼".equals(text)) {
            // Skip the chat megathread, it is far too large.
            continue;
        }
        String threadHref = element.attr("href");
        System.out.println("进入标题为:" + text + "的帖子,二级地址是:" + threadHref);
        content("https://tieba.baidu.com" + threadHref, text);
    }
    // Look for the "next page" pagination link.
    Elements select = parse.select("a[class~=^next.pagination-item]");
    System.out.println("当前页数为:" + PAGE + ",当前页有:" + aContent.size() + "条帖子");
    if (select.size() == 1) {
        Element element = select.get(0);
        if ("下一页".equals(element.text().replace(">", ""))) {
            // The link is protocol-relative, so prefix the scheme and recurse.
            PAGE++;
            createHome("https:" + element.attr("href"));
        }
    }
    System.out.println("吧内帖子总数为:" + TotalCount + ",总楼层为:" + ContentTotalCount);
    return "ok";
}
/**
 * Crawls one page of a thread, appends every non-ad floor to {@code D:/IO/<title>.md} and then
 * recurses into the thread's "next page" link if there is one.
 *
 * <p>When the response contains the anti-crawler marker text, the method blocks on standard
 * input until a fresh cookie is pasted, stores it in {@code CONTENT_HEARD_MAP} and retries the
 * same URL.
 *
 * @param contentHref absolute URL of the thread page to crawl
 * @param titlePath   thread title; used as the output file name after illegal characters are replaced
 * @throws IOException if the page cannot be fetched or the output file cannot be written
 */
public static void content(String contentHref, String titlePath) throws IOException {
    // Open the connection and fetch the thread page.
    Document doc = Jsoup.connect(contentHref)
            .headers(getContentHeardMap())
            .get();
    // Strip HTML comment delimiters so markup wrapped in comments becomes parseable.
    String html = doc.body().toString().replace("<!--", "").replace("-->", "");
    if (html.contains("网络不给力")) {
        System.out.println("已触发百度反爬虫验证,程序暂停,请更换贴内cookie,终止链接为:\"" + contentHref + "\"");
        System.out.println("在下面粘贴从浏览器复制的cookie");
        // The Scanner is intentionally not closed: closing it would close System.in.
        String next = new Scanner(System.in).nextLine();
        // Swap in the cookie that passed the verification.
        CONTENT_HEARD_MAP.put("Cookie", next);
        System.out.println("cookie是:" + CONTENT_HEARD_MAP.get("Cookie"));
        // Retry and stop here; falling through would process the blocked page a second time.
        content(contentHref, titlePath);
        return;
    }
    Document parse;
    try {
        parse = Jsoup.parse(html);
    } catch (Exception e) {
        // Keep the cause so the real parser failure is visible in the stack trace.
        throw new RuntimeException("解析帖子内容时异常,请尝试更换Jsoup版本", e);
    }
    // All floors (posts) on this page.
    Elements select = parse.select("div[class~=^l_post.l_post_bright.j_l_post.clearfix]");
    ContentTotalCount += select.size();
    // Replace path separators and the other characters Windows rejects in file names.
    titlePath = titlePath.replace("/", "-").replace("<", "《")
            .replace(">", "》").replace("|", "-").replace("\\", "")
            .replace(":", "-").replace("*", "-").replace("?", "-").replace("\"", "");
    File file = new File("D:/IO/" + titlePath + ".md");
    // Append mode creates the file when missing. UTF-8 is explicit so the output does not
    // depend on the platform default charset. try-with-resources closes the writer on errors.
    try (java.io.Writer fileWriter = new java.io.OutputStreamWriter(
            new java.io.FileOutputStream(file, true), java.nio.charset.StandardCharsets.UTF_8)) {
        for (Element allElement : select) {
            // Elements carrying a data-locate attribute are ads; skip them.
            if (allElement.getElementsByAttribute("data-locate").isEmpty()) {
                writeFloor(fileWriter, allElement);
            }
        }
    }
    // Follow the "next page" link of the first pager, if any.
    Elements a = parse.select("li[class~=^l_pager.pager_theme_5]");
    if (!a.isEmpty()) {
        for (Element next : a.get(0).getElementsByTag("a")) {
            if ("下一页".equals(next.text())) {
                content("https://tieba.baidu.com" + next.attr("href"), titlePath);
                // One next-page link is enough; a duplicate would crawl the rest twice.
                break;
            }
        }
    }
    System.out.println("当前帖子楼层数为:" + ContentTotalCount);
    // NOTE(review): the result is discarded; kept in case the call has side effects — confirm.
    ReturnBody.success();
}

/**
 * Writes one floor to {@code out}: author, text, the first image URL of each content node,
 * floor number, date and a separator line.
 */
private static void writeFloor(java.io.Writer out, Element floor) throws IOException {
    // Author name.
    Elements name = floor.select("a[class~=^p_author_name]");
    // Post body.
    Elements content = floor.select("div[class~=^d_post_content.j_d_post_content]");
    // Tail spans hold the floor number and the post date.
    Elements span = floor.select("span[class~=^tail-info]");
    String lou = "";
    String date = "";
    for (Element element : span) {
        String text = element.text();
        if (text.endsWith("楼")) {
            lou = text;
        }
        if (text.contains("-")) {
            date = text;
        }
    }
    out.write("当前层主为:\"" + name.text() + "\"");
    if (!"".equals(content.text())) {
        out.write("\r\n");
        out.write("内容是:\"" + content.text() + "\"");
    }
    out.write("\r\n");
    System.out.println("当前层主为:\"" + name.text() + ",帖子内容是:" + content.text());
    for (Element element : content) {
        Elements imgs = element.select("img[class~=^BDE_Image]");
        String src = imgs.attr("src");
        if (!"".equals(src)) {
            out.write("当前楼层图片内容为:\"" + src + "\"");
            out.write("\r\n");
            System.out.println("当前楼层图片内容为:\"" + src + "\"");
        }
    }
    out.write(lou + " ");
    out.write(date);
    out.write("\r\n");
    out.write("-----------------------------------------------------------------------------------分割线-----------------------------------------------------------------------------------");
    out.write("\r\n");
    // Flush per floor so already-crawled floors survive a later failure.
    out.flush();
}
}
导入依赖
<!-- jsoup: HTML parsing library -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
注意:jsoup版本为1.12.2时,Jsoup.parse()在某些特定页面会抛出IOException,把版本号换成1.11.3即可。
利用Jsoup创建连接
利用jsoup的connect来和百度贴吧服务器建立连接,类似于HttpClient请求,传入一个请求地址,会返回一个Document对象,这里以“抗压背锅吧”为例子,进入浏览器,获取到贴吧列表的请求地址,作为连接传入到connect方法中
headers是配置请求头,来模仿浏览器进行获取html响应结果,下面会解释。
// Open the connection and fetch the response document
Document doc = Jsoup.connect(href)
.headers(getHomeHeardMap())
.get();
配置请求头
由于贴吧对页面请求做了ip限制,当同一个局域网ip内多次请求贴吧列表页面或者贴子内容页面的时候,会弹出百度反爬的一个安全验证,所以我们配置两个通用的请求头来模仿真人操作绕过他的验证
/** Request headers for forum list pages */
public static final Map<String, String> HOME_HEARD_MAP = new HashMap<>();
/** Request headers for thread content pages */
public static final Map<String, String> CONTENT_HEARD_MAP = new HashMap<>();
public static Map<String, String> getHomeHeardMap() {
HOME_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
HOME_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
HOME_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
HOME_HEARD_MAP.put("Connection", "keep-alive");
//这里为什么要判断cookie会在下面解释
if (!HOME_HEARD_MAP.containsKey("Cookie")) {
HOME_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; delPer=0; PSINO=6; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598932,1614598938,1614651172,1614656659; BCLID=6696522698307635931; BDSFRCVID=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; BCLID_BFESS=6696522698307635931; BDSFRCVID_BFESS=-xLOJeC62AC9at3eh_4A8PV7WjpqhyTTH6aoV9hsteac5gjTXm08EG0PfM8g0Ku-qw2ZogKK3gOTHxKF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; 
H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbC8RejKBDj5Mbxv0K-vJ--o2LPoV-TrjDnCrqJ7dXUI8LNDH3xt8K6Pe0Rn7JpDWVML63P62Ktk-3bO7ttoyQJ53Q-bHKR8henc2W-F2eML1Db3hW6vMtg3ts4j5tfcoepvoDPJc3MkbyPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEK5r2SC_MJCP53j; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e787570f1fcab17da5132635e67fb49ac3a2ee926ac26ab414b8dc1f022a26b6af0be5ec5feb08e47ecea40e7c3ac42af63418eb176202b934e8a5d65a31f7f67c9; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614661234; st_sign=46af36f3; st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848b4e9bf304bec24493f1d19c3b7526fc3ce49228569294a5afc07905bed78d5368; BAIDUID_BFESS=CDDCC97066F658F1A310835A932F3477:FG=1; BA_HECTOR=04ahah0ga1a5800ko11g3ri0j0r; ZD_ENTRY=baidu; ab_sr=1.0.0_MmE3OWNlMWI1NjEwM2RiYTNmNmUwNjRiZTZiOWUxZjNhMWVjOTU5Y2ZjYzM3YTdiNWNhMTU1ZjRiZTFhNzRhZDU3NTk1Y2RkMGU4MzQyMzkzM2U4OTYzZTYxMDE3OGQ1");
}
HOME_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
return HOME_HEARD_MAP;
}
public static Map<String, String> getContentHeardMap() {
CONTENT_HEARD_MAP.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
CONTENT_HEARD_MAP.put("Accept-Encoding", "gzip, deflate, br");
CONTENT_HEARD_MAP.put("Accept-Language", "zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6");
CONTENT_HEARD_MAP.put("Connection", "keep-alive");
CONTENT_HEARD_MAP.put("Cache-Control", "max-age=0");
//同理
if (!HOME_HEARD_MAP.containsKey("Cookie")) {
CONTENT_HEARD_MAP.put("Cookie", "BIDUPSID=45401D87AD2D1AC10DC8EF4AF5BF2AAD; PSTM=1595318555; BAIDUID=45401D87AD2D1AC1B7A11A202D1726BA:FG=1; bdshare_firstime=1595501258246; H_WISE_SIDS=154034_154770_153759_151993_155858_149355_150967_156818_156286_155320_154259_155984_148867_155683_156096_154804_156622_153444_152409_131861_154772_155436_153755_151016_127969_154413_154175_155962_155331_152981_155908_150346_155803_146732_131423_154037_155394_154189_156945_155344_157024_154953_157075_151872_144966_153657_154214_154118_154801_154902_156726_155931_154145_147551_157028_153446_156606_152310_155388_154357_155864_110085_157006; MCITY=-187:; BDSFRCVID_BFESS=oj0OJexroG3V4WbeM-8t8PVZdeKK0gOTDYLtOwXPsp3LGJLVgVbOEG0PtEhTCoub_2AUogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tR-qVIK5tIK3H48k-4QEbbQH-UnLq-RBtgOZ04n-ah05SCb5-4oYqjk3eb3pXt3-W20j0h7m3UTdfh76Wh35K5tTQP6rLtbpKeO4KKJxbp5sShOv5t5rDx_AhUJiB5OMBan7_qvIXKohJh7FM4tW3J0ZyxomtfQxtNRJ0DnjtpChbRO4-TF-jjQyDU5; __yjs_duid=1_39c522ada6e30df532f8d767834b2a8e1614307604471; top_list=4244232993-7182834579; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=33514_33272_33570_33392_33460_22158; st_key_id=17; wise_device=0; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1614598461,1614598932,1614598938,1614651172; delPer=0; PSINO=6; BA_HECTOR=018g2g0l25052l0g3a1g3r9t70q; tb_as_data=f91194c6824894d324c39c29837c6b9c50ec65fab018aeb4e474b20db842845825c96f71f4c6eb6aea2f61716f232e78d3c5a1aeac7bb1e60c92309e4169db1b26e5605e313a1f6752c4b482431b1fe7faa5841c98f1d6409e6296b85974757ebd72ed37a64a1ff6d29f3ddc6e838db0; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1614656468; ab_sr=1.0.0_MDAwYzVjYzdmZjhhNzYzNjhlMGZmZWRiN2FkYmJiZGMwM2VmZmJmY2ZmNDljNWRmYWEyNzE2NDk5YzRhNGExNjQzNTEzM2I5YjMwZGRkYTgwMmE1MjQyNWFmNjc1ZGUw; 
st_data=305c1867cacda6bc575bc022c406a997445a569b4e6fd53fec92a0642aee94c5d695d65fe4c5360c0f99c161476ba8dc7fb649742c1c4278775ae474ae817ef284ce4f5f60d1c0ad9b4c2c7002d944758ca3e2766e503b929c2f411069f9848bc1e6ab946a724413075895137bad90ae1601113654bfc92f8dec8bf5d540fc12; st_sign=6215e595");
}
CONTENT_HEARD_MAP.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36");
return CONTENT_HEARD_MAP;
}
这里的数据在首页的第一个请求头那里去复制过来即可
接下来就是调用jsoup来获取html内容解析了
// Open the connection and fetch the list page
Document doc = Jsoup.connect(href)
        .headers(getHomeHeardMap())
        .get();
// Strip HTML comment delimiters so markup wrapped in comments becomes parseable
String html = doc.body().toString().replace("<!--", "").replace("-->", "");
if (html.contains("网络不给力")) {
    System.out.println("已触发百度反爬虫验证,程序已暂停,请更换贴吧列表cookie,终止链接是:\"" + href + "\"");
    System.out.println("在下面粘贴从浏览器复制的cookie");
    String next = new Scanner(System.in).nextLine();
    // Swap in the cookie that passed the verification
    HOME_HEARD_MAP.put("Cookie", next);
    System.out.println("cookie是:" + HOME_HEARD_MAP.get("Cookie"));
    // Retry and stop here; falling through would parse the blocked page a second time
    return createHome(href);
}
Document parse;
try {
    parse = Jsoup.parse(html);
} catch (Exception e) {
    // Keep the cause so the real parser failure is visible in the stack trace
    throw new RuntimeException("解析贴外列表时异常,请尝试更换Jsoup版本", e);
}
这里的过滤操作是因为请求贴吧列表的时候,百度会对列表的主内容,也就是body区域做一个注释处理,如下图:
不知道为什么百度会做这样一个操作,在之前的版本中,2020年之前,都不会这样,但是既然他做了,字符串替换掉就是了。
这一部分就是用来判断是否已触发贴吧的安全验证,因为爬取一个吧内的所有帖子,会请求成千上万次页面,达到一定次数后,再去请求就会返回安全验证的页面,自然也就解析不到里面的内容了。
一开始百度的时候,是说添加一个Accept请求头就可以了,但是实际上就算添加了这个请求头,再多次请求后,一样会弹出这个页面,在我多次尝试之后,发现最主要的是安全验证通过之后,贴吧响应回的cookie值,这个cookie保持安全验证的值就可以正常请求响应了,我这里做了一个键盘输入的操作,判断触发验证后阻塞程序,然后去浏览器通过安全验证后复制通过验证的cookie值,粘贴过来,程序递归再次调用自己,就可以了。(也可以做一个请求,线程阻塞,输入cookie后响应回来,这样的操作,看个人喜好,我为了方便,就直接用键盘录入了。)
这就是前面getHomeHeardMap()里要先判断cookie是否已存在的原因:如果不判断,递归调用方法时getHomeHeardMap()会重新初始化请求头,刚粘贴的新cookie又会被覆盖成一开始配置的过期cookie。
接下来就是分析它的页面结构,F12得知,所有帖子的标题链接(包括跳转的二级地址)都放在class属性带有"j_th_tit"的a标签里
所以这里我们可以通过jsoup的select查询来查出当前页面的所有帖子列表,并进行操作,这里的思路是获取到当前的帖子列表后,获取到href值,然后调用子方法创立链接,子方法后面贴上
// Select the thread title links on the current list page
Elements aContent = parse.select("a.j_th_tit");
for (Element element : aContent) {
String text = element.text();
if ("专用水楼".equals(text)) {
// Skip the chat megathread, it is far too large
continue;
}
href = element.attr("href");
System.out.println("进入标题为:" + text + "的帖子,二级地址是:" + href);
// content(...) opens the thread, fetches it and parses its floors
content("https://tieba.baidu.com" + href, text);
}
由于我们要获取吧内所有贴子,所以要获得下一页的地址,f12控制台获得下一页的标签,通过select选取获得数据,遍历后,如果内容为“下一页”,那么就带着下一页的地址,递归调用自身
// Check whether there is a next page
Elements select = parse.select("a[class~=^next.pagination-item]");
System.out.println("当前页数为:" + PAGE + ",当前页有:" + aContent.size() + "条帖子");
if (select != null && select.size() == 1) {
Element element = select.get(0);
if ("下一页".equals(element.text().replace(">", ""))) {
href = element.attr("href");
// Recurse to crawl the threads of every following page
createHome("https:" + href);
}
}
解析贴内数据
上面开启链接和绕过验证的部分都是相同的,就不再解释了;这里做了一些io操作和过滤操作,相信看代码都能看懂,都是大同小异的
// Select all floors (posts) on the page
Elements select = parse.select("div[class~=^l_post.l_post_bright.j_l_post.clearfix]");
ContentTotalCount += select.size();
// Replace characters that are not allowed in the file name
titlePath = titlePath.replace("/", "-").replace("<", "《")
.replace(">", "》").replace("|","-").replace("\\","");
File file = new File("D:/IO/" + titlePath + ".md");
if (!file.exists()) {
file.createNewFile();
}
// Open the output file in append mode
FileWriter fileWriter = new FileWriter(file, true);
for (Element allElement : select) {
Elements elementsByAttribute = allElement.getElementsByAttribute("data-locate");
// Skip ads: only elements without a data-locate attribute are written
if ("".equals(elementsByAttribute.toString())) {
// Author name of this floor
Elements name = allElement.select("a[class~=^p_author_name]");
// Post body of this floor
Elements content = allElement.select("div[class~=^d_post_content.j_d_post_content]");
// Tail spans holding the post date and the floor number
Elements span = allElement.select("span[class~=^tail-info]");
String lou = "";
String date = "";
for (Element element : span) {
String text = element.text();
if (text.endsWith("楼")) {
lou = text;
}
if (text.contains("-")) {
date = text;
}
}
// Write the floor to the file
fileWriter.write("当前层主为:\"" + name.text() + "\"");
if (!"".equals(content.text())) {
fileWriter.write("\r\n");
fileWriter.write("内容是:\"" + content.text() + "\"");
}
fileWriter.write("\r\n");
System.out.println("当前层主为:\"" + name.text() + ",帖子内容是:" + content.text());
for (Element element : content) {
Elements imgs = element.select("img[class~=^BDE_Image]");
String src = imgs.attr("src");
if (!"".equals(src)) {
fileWriter.write("当前楼层图片内容为:\"" + src + "\"");
fileWriter.write("\r\n");
System.out.println("当前楼层图片内容为:\"" + src + "\"");
}
}
fileWriter.write(lou + " ");
fileWriter.write(date);
fileWriter.write("\r\n");
fileWriter.write("-----------------------------------------------------------------------------------分割线-----------------------------------------------------------------------------------");
fileWriter.write("\r\n");
// Flush the buffer after each floor
fileWriter.flush();
}
}
主要的步骤就是获取需要的数据内的class属性名,然后通过select选择器,通过正则去匹配出数据,最后处理输出即可。