大二下的Java设计课程需要用Java写爬虫,记得当时最后爬的是小蓝鸟,又是代理又是配置的,反正很折磨。这是当时学jsoup和HttpClient的笔记,不是很全,内容很简单,B站教程里都有讲。
最后实在不建议用java爬虫,太折磨。
Jsoup页面解析
<!-- jsoup maven依赖 -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<!-- 测试 -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<!-- 文件操作工具 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<!-- 字符串操作工具 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.8.1</version>
</dependency>
Jsoup解析
//Parse a page fetched from a URL
/**
 * Fetches the page at the given URL, parses it with jsoup and prints the text
 * of the first {@code <title>} element.
 *
 * @throws Exception if the URL is malformed or the page cannot be fetched
 */
@Test
public void urlTest() throws Exception{
    //Jsoup.parse(url, timeoutMillis): fetch and parse the page, giving up after 1000 ms
    Document dom = Jsoup.parse(new URL("需要请求的地址"),1000);
    //Select elements by tag name, then work on the result;
    //text() returns the text content of the element
    String title = dom.getElementsByTag("title").first().text();
    System.out.println(title);
}
//Parse HTML held in a string
/**
 * Reads an HTML file into a string, parses that string with jsoup and prints
 * the text of the first {@code <title>} element.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void stringTest() throws Exception{
    //Read the whole file as text with the commons-io helper and hand it straight to the parser
    Document dom = Jsoup.parse(FileUtils.readFileToString(new File("文件地址"), "utf8"));
    //Print the text of the first <title> element
    System.out.println(dom.getElementsByTag("title").first().text());
}
//Parse HTML directly from a file
/**
 * Lets jsoup read and parse an HTML file itself, then prints the text of the
 * first {@code <title>} element.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void fileTest() throws Exception{
    //jsoup opens the file and decodes it with the given charset
    File source = new File("文件地址");
    Document dom = Jsoup.parse(source,"utf8");
    //Print the text of the first <title> element
    System.out.println(dom.getElementsByTag("title").first().text());
}
通过各种方法获取元素:
通过id查询元素getElementById()
通过标签获取元素getElementsByTag()
根据class获取元素getElementsByClass()
根据属性获取元素getElementsByAttribute()
从元素中获取数据(id、className、attr、attributes、text):
@Test
public void dataTest(){
//解析文本获取Document
Document dom = Jsoup.parse(new File("需要打开的解析文本的地址"),"utf8");
//通过各种方法取得元素
Element element_id = dom.getElementById("需要取的Id");
//从元素中获取id
System.out.println("id中获取到的内容是: " + element_id.id());
//输出取得的所有class
System.out.println("className: " + element_id.className());
//循环打印class中的每个元素值
Set<String> classSet = element_id.classNames();
for(String str : classSet){
System.out.println(str);
}
//从元素中获取属性的值attr
System.out.println("element_id中对应的属性值: " + element_id.attr("对应属性名"))
//从元素中获取所有的属性attributes
Attributes attributes = element_id.attributes();
System.out.println(attributes.toString());
//获得所有文本内容text
System.out.println("element_id中的文本值: " + element_id.text());
}
Selector选择器查询元素
/**
 * Demonstrates the basic selector forms of {@code select()}: by tag name,
 * by id, by class, by attribute name, and by attribute name plus value.
 *
 * @throws Exception if the URL is malformed or the page cannot be fetched
 */
@Test
public void selectorTest() throws Exception{
    //new URL(...) and Jsoup.parse(URL, int) both throw checked exceptions,
    //so the method has to declare them
    Document dom = Jsoup.parse(new URL("需要解析的地址"),1000);
    //Select by tag name
    Elements elements_tag = dom.select("需要查找的标签名字");
    for(Element e : elements_tag){
        System.out.println(e.text());
    }
    //Select by id: "#id"
    Element element_id = dom.select("#需要的id").first();
    System.out.println(element_id.text());
    //Select by class name: ".class"
    Elements elements_class = dom.select(".需要的class");
    for(Element e : elements_class){
        System.out.println(e.text());
    }
    //Select by attribute name: "[attr]"
    Elements elements_attr = dom.select("[需要的attr]");
    for(Element e : elements_attr){
        System.out.println(e.text());
    }
    //Select by attribute name and value: "[attr=value]"
    Elements elements_attributes = dom.select("[属性名 = 属性值]");
    for(Element e : elements_attributes){
        System.out.println(e.text());
    }
}
选择器组合使用
/**
 * Demonstrates combined selectors: tag with id, tag with class, tag with
 * attribute, descendant, direct child and all direct children.
 * The results are not used; the method only shows the selector syntax.
 *
 * @throws Exception if the URL is malformed or the page cannot be fetched
 */
@Test
public void selectorCombinedTest() throws Exception{
    //new URL(...) and Jsoup.parse(URL, int) both throw checked exceptions,
    //so the method has to declare them
    Document dom = Jsoup.parse(new URL("需要解析的地址"),1000);
    //tag + id
    Element tag_id = dom.select("需要的tag#需要的id").first();
    //tag + class
    Element tag_class = dom.select("需要的tag.需要的class").first();
    //tag + attribute name
    Element tag_attr = dom.select("需要的tag[需要的attr]").first();
    //ancestor descendant: every matching element at any depth below the parent
    Elements elements = dom.select(".父对象class 子对象tag");
    //parent > child: only matching first-level children; can be chained for deeper levels
    Elements elements01 = dom.select(".父对象class>直接子元素tag");
    //parent > *: every direct child of the parent
    Elements elements02 = dom.select(".父对象class>*");
}
HttpClient页面抓取
HttpClient用于页面抓取
Get请求默认爬取
public static void main(String[]args) throws IOException {
//创建HttpClient对象(浏览器)
CloseableHttpClient httpClient = HttpClients.createDefault();
//HttpGet对象参数支持传送uri
HttpGet httpGet = new HttpGet("需要的网址");
//创建Response对象
CloseableHttpResponse response = null;
try{
response = httpClient.execute(httpGet);
//若response取得的状态码为200则判断连接成功
if(response.getStatusLine.getStatusCode == 200){
String content = Entityutils.toString(response.getEntity(),"utf8");
System.out.println(content);
}
}catch(Exception e){
e.printStackTrace();
}finally{
//关闭response和client对象
if(response != null)
response.close;
httpClient.close;
}
}
Get带参数请求
public static void main(String[]args) throws IOException, URISyntaxException {
CloseableHttpClient httpClient = HttpClients.createDefault();
//创建URIBuilder对象并设置请求参数
URIBuilder uriBuilder = new URIBuilder("需要请求的地址");
uriBuilder.setParamter("请求参数名1","请求参数值1").setParamter("请求参数名2","请求参数名2");
//传入uriBuilder.build()获取uri对象
HttpGet httpGet = new HttpGet(uriBuilder.build());
CloseableHttpResponse response = null;
try{
response = httpClient.execute(httpGet);
if(response.getStatusLine.getStatusCode == 200){
String content = Entityutils.toString(response.getEntity(),"utf8");
System.out.println(content);
}
}catch(Exception e){
e.printStackTrace();
}finally{
if(response != null)
response.close;
httpClient.close;
}
}
Post请求
/**
 * Sends a POST request without a body and prints the response body when the
 * server answers with status 200.
 *
 * @param args unused
 * @throws IOException if closing the HttpClient fails
 */
public static void main(String[]args) throws IOException{
    //try-with-resources guarantees the client is closed; with the manual
    //finally block a failing response.close() skipped httpClient.close()
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        //Create the HttpPost request
        HttpPost httpPost = new HttpPost("需要请求的网址");
        //The response is closed automatically when this block ends
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            //Status code 200 means the request succeeded
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(),"utf8");
                System.out.println(content);
            }
        } catch (Exception e) {
            //Any failure while executing, reading or closing the response is only printed
            e.printStackTrace();
        }
    }
}
Post带参数请求
/**
 * Sends a POST request carrying URL-encoded form parameters and prints the
 * response body when the server answers with status 200.
 *
 * @param args unused
 * @throws IOException if the form entity cannot be encoded or closing the
 *         HttpClient fails
 */
public static void main(String[]args) throws IOException {
    //try-with-resources closes the client even when building the form entity
    //or closing the response fails
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        HttpPost httpPost = new HttpPost("需要请求的地址");
        //Collect the form fields as name/value pairs
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("请求名","请求值"));
        //Wrap the pairs in a URL-encoded form entity
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8");
        //Attach the form entity to the POST request
        httpPost.setEntity(formEntity);
        //The response is closed automatically when this block ends
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            //Status code 200 means the request succeeded
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(),"utf8");
                System.out.println(content);
            }
        } catch (Exception e) {
            //Any failure while executing, reading or closing the response is only printed
            e.printStackTrace();
        }
    }
}
HttpClient连接池 / HttpClient请求参数设置
/**
 * Creates a pooling connection manager, configures its limits and sends two
 * GET requests through it via doGet.
 *
 * @param args unused
 * @throws IOException propagated from doGet
 */
public static void main(String[]args) throws IOException{
    //Connection pool manager used by both requests below
    PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager();
    //Pool limits
    connectionManager.setDefaultMaxPerRoute(10);//maximum connections per host
    connectionManager.setMaxTotal(100);//maximum connections overall
    //Send two requests through the same pool
    for(int i = 0; i < 2; i++){
        doGet(connectionManager);
    }
}
/**
 * Sends one GET request through an HttpClient backed by the given connection
 * manager and prints the length of the response body when the server answers
 * with status 200.
 *
 * @param cm connection manager supplying the pooled connections
 * @throws IOException if closing the response fails
 */
public static void doGet(PoolingHttpClientConnectionManager cm) throws IOException {
    //Build an HttpClient on top of the connection manager on every call;
    //HttpClientBuilder finishes with build(), there is no builder() method
    CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    HttpGet httpGet = new HttpGet("需要请求的网址");
    //Per-request settings
    RequestConfig config = RequestConfig.custom()
            .setConnectionRequestTimeout(500)//maximum time to obtain a connection
            .setConnectTimeout(1000)//maximum time to establish the connection (ms)
            .setSocketTimeout(10*1000)//maximum time for data transfer (ms)
            .build();//produces the RequestConfig object
    httpGet.setConfig(config);
    CloseableHttpResponse response = null;
    try{
        response = httpClient.execute(httpGet);
        //Status code 200 means the request succeeded
        if(response.getStatusLine().getStatusCode() == 200){
            String content = EntityUtils.toString(response.getEntity(), "utf8");
            System.out.println(content.length());
        }
    }catch (Exception e){
        e.printStackTrace();
    }finally {
        if(response != null)
            response.close();
        //Do not close the HttpClient here: it sits on the connection manager
        //that the caller reuses for further requests
        //httpClient.close();
    }
}