Java网络爬虫(jsoup)
1.爬虫技术是一种从网页上抓取数据信息并保存的自动化程序,它的原理就是模拟浏览器发送网络请求,接收请求响应,然后按照一定的规则自动抓取互联网数据。打开天眼查搜索页面,并分析页面
https://www.tianyancha.com/search?key=%E8%85%BE%E8%AE%AF&sessionNo=1670982628.10251085
目标元素:如下
1.1引入依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2.爬取数据(获取请求返回的页面信息,筛选出可用的)
创建HtmlParseUtil,并简单编写
package com.jxj.elasticsearch.utils;

import com.jxj.elasticsearch.pojo.Content;
// NOTE(review): accidental IDE auto-import of an internal JDK class; it is never
// referenced in this file and is safe to delete.
import com.sun.scenario.effect.impl.sw.sse.SSEBlend_SRC_OUTPeer;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
public class HtmlParseUtil {

    /** Manual smoke test: crawl the hard-coded search page and print the parsed fields. */
    public static void main(String[] args) throws IOException {
        System.out.println(parseJD("java"));
    }

    /**
     * Fetches a Tianyancha company-search result page and extracts the first
     * entry's fields (logo URL, company name, legal representative, registered
     * capital, registration date, phone, e-mail, address), printing each to stdout.
     *
     * <p>NOTE(review): {@code keyword} is currently ignored — the request URL is
     * hard-coded (URL-encoded query). Confirm whether the keyword should be
     * interpolated as in the commented-out JD URL. Requires network access.
     *
     * @param keyword search keyword (currently unused)
     * @return parsed results; currently always an empty list because mapping the
     *         extracted fields into {@link Content} is not implemented yet.
     *         (Previously returned {@code null}, forcing callers to null-check.)
     * @throws IOException if the page cannot be fetched
     */
    public static List<Content> parseJD(String keyword) throws IOException {
        // Hard-coded request URL; "%E8%85%BE%E8%AE%AF" is the URL-encoded query.
        //String url = "http://search.jd.com/search?keyword=" + keyword;
        String url = "https://www.tianyancha.com/search?key=%E8%85%BE%E8%AE%AF&sessionNo=1670982628.10251085";

        // 1. Fetch and parse the page (jsoup returns a DOM-like Document; 30s timeout).
        Document document = Jsoup.parse(new URL(url), 30000);

        // 2. Locate the search-result items. Guard against an empty result so a
        //    layout change or empty search degrades gracefully instead of
        //    throwing IndexOutOfBoundsException on get(0).
        Elements items = document.getElementsByClass("index_search-item__W7iG_");
        if (items.isEmpty()) {
            System.out.println("no search results found (page layout may have changed)");
            return new ArrayList<>();
        }

        // Only the first result item is parsed.
        Element item = items.get(0);
        System.out.println(item);
        System.out.println("===========================");

        // Company logo <img src> from the left column.
        String qyUrl = item.getElementsByClass("index_search-item-left__eTjDK")
                .get(0).getElementsByClass("index_hover-image-claim__6PJ3T").get(0)
                .getElementsByClass("_1efed _76350 index_item-logo__aquZ_ _c46e6 _3426b").get(0)
                .getElementsByTag("img").eq(0).attr("src");
        System.out.println("企业图片:" + qyUrl);

        // All remaining fields live in the center column; resolve it once
        // instead of re-querying the full chain for every field.
        Element center = item.getElementsByClass("index_search-item-center__Q2ai5").get(0);

        // Company name (header link text).
        String qyName = center.getElementsByClass("index_header__x2QZ3").get(0)
                .getElementsByClass("index_name__qEdWi").get(0)
                .getElementsByTag("a").eq(0).text();
        System.out.println("企业公司:" + qyName);

        // Legal representative.
        String qyFrName = center.getElementsByClass("index_info-row__xbtyD index_line-row__R3mCi").get(0)
                .getElementsByClass("index_info-col__UVcZb index_wider__gQok0").get(0)
                .getElementsByTag("a").eq(0).text();
        System.out.println("企业法人:" + qyFrName);

        // Registered capital.
        String qyZcZj = center.getElementsByClass("index_info-row__xbtyD index_line-row__R3mCi").get(0)
                .getElementsByClass("index_info-col__UVcZb index_narrow__QeZfV").get(0)
                .getElementsByTag("span").eq(0).text();
        System.out.println("企业注册资金:" + qyZcZj);

        // Registration date.
        String qyClsj = center.getElementsByClass("index_info-row__xbtyD index_line-row__R3mCi").get(0)
                .getElementsByClass("index_info-col__UVcZb").get(0)
                .getElementsByTag("span").eq(0).text();
        System.out.println("企业注册时间:" + qyClsj);

        // Phone number: second <span> in the first contact column of the first contact row.
        String qyPhone = center.getElementsByClass("index_contact-row__iYUn6 index_line-row__R3mCi").get(0)
                .getElementsByClass("index_contact-col__7AboU").get(0)
                .getElementsByTag("span").eq(1).text();
        System.out.println("手机号:" + qyPhone);

        // E-mail: second contact column of the same row.
        String qyEmile = center.getElementsByClass("index_contact-row__iYUn6 index_line-row__R3mCi").get(0)
                .getElementsByClass("index_contact-col__7AboU").get(1)
                .getElementsByTag("span").eq(1).text();
        System.out.println("邮箱:" + qyEmile);

        // Address: first contact column of the second contact row.
        String qyAddress = center.getElementsByClass("index_contact-row__iYUn6 index_line-row__R3mCi").get(1)
                .getElementsByClass("index_contact-col__7AboU").get(0)
                .getElementsByTag("span").eq(1).text();
        System.out.println("地址:" + qyAddress);

        // TODO: map the extracted fields into Content objects and add them here.
        return new ArrayList<>();
    }
}
结果