爬虫是什么?
爬虫就是通过发送请求获取网页数据,然后对数据进行解析和存储的一个过程。
============================================================================================================
爬虫的分类
1:垂直爬虫:针对某一类网站,专门爬取
参考案例:http://www.manmanbuy.com/
专门针对一些大型的电商网站进行爬取
2:通用爬虫:针对整个万维网,所有的网站都抓取
参考案例:Google 等搜索引擎
===========================================================================
Http请求发送的三种方式
1.使用jdk自带的api发送http请求
/**
 * Sends a GET request using the JDK's built-in {@link URLConnection} API and prints the
 * response body to standard output.
 *
 * @throws Exception if the URL is malformed or the request or read fails
 */
@Test
public void jdk_get() throws Exception {
    // Target address of the request.
    URL url = new URL("http://www.xxx.com");
    // Open a connection to that address.
    URLConnection openConnection = url.openConnection();
    // Accumulates the response text; StringBuilder suffices because it never leaves this method.
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the reader and the response stream even when reading fails.
    // The charset is given explicitly so decoding does not depend on the platform default.
    try (InputStream inputStream = openConnection.getInputStream();
         BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
        String s;
        while ((s = br.readLine()) != null) {
            // readLine() strips the line terminator, so add one back after each line.
            sb.append(s).append("\r\n");
        }
    }
    System.out.println(sb);
}
/**
 * Sends a POST request carrying one form parameter using the JDK's built-in
 * {@link HttpURLConnection} API and prints the response body to standard output.
 *
 * @throws Exception if the URL is malformed or the request or read fails
 */
@Test
public void jdk_post() throws Exception {
    // Target address of the request.
    URL url = new URL("http://www.itcast.cn");
    // Cast to HttpURLConnection: setRequestMethod is declared on that subclass,
    // not on the abstract URLConnection.
    HttpURLConnection openConnection = (HttpURLConnection) url.openConnection();
    // Use the POST method.
    openConnection.setRequestMethod("POST");
    // Allow the request to carry a body.
    openConnection.setDoOutput(true);
    // Write the request body; closing the stream releases it before the response is read.
    try (OutputStream outputStream = openConnection.getOutputStream()) {
        outputStream.write("username=zhangsan".getBytes("UTF-8"));
    }
    // Accumulates the response text.
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the reader and the response stream even when reading fails.
    try (InputStream inputStream = openConnection.getInputStream();
         BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"))) {
        String str;
        while ((str = br.readLine()) != null) {
            // readLine() strips the line terminator, so add one back after each line.
            sb.append(str).append("\r\n");
        }
    }
    // Print the collected response, matching jdk_get; previously it was built and then discarded.
    System.out.println(sb);
}
============================================================================
2.使用HttpClient发送请求(apache出品的一个专门用于网络请求的jar包)
// GET request
/**
 * Sends a GET request with Apache HttpClient and prints the response body to standard output.
 *
 * @throws Exception if the request fails or the response body cannot be read
 */
@Test
public void httpClient_get() throws Exception {
    // Request method and target address.
    HttpGet hg = new HttpGet("http://www.itcast.cn");
    // try-with-resources closes the response and the client (in that order) even on failure,
    // so the underlying connection is not leaked.
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(hg)) {
        // The response payload.
        HttpEntity entity = response.getEntity();
        // Convert the payload to a string, decoding as UTF-8 to avoid garbled text.
        String str = EntityUtils.toString(entity, "UTF-8");
        System.out.println(str);
    }
}
// POST request
/**
 * Sends a form-encoded POST request with Apache HttpClient and prints the response body
 * to standard output.
 *
 * @throws Exception if the request fails or the response body cannot be read
 */
@Test
public void httpClient_post() throws Exception {
    // Request method and target address.
    HttpPost hp = new HttpPost("http://www.itcast.cn");
    // Form parameters sent in the request body.
    List<BasicNameValuePair> parameters = new ArrayList<>();
    parameters.add(new BasicNameValuePair("username", "zhangsan"));
    UrlEncodedFormEntity entity = new UrlEncodedFormEntity(parameters);
    hp.setEntity(entity);
    // try-with-resources closes the response and the client (in that order) even on failure,
    // so the underlying connection is not leaked.
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(hp)) {
        // The response payload.
        HttpEntity entity1 = response.getEntity();
        // Convert the payload to a string, decoding as UTF-8 to avoid garbled text.
        String str = EntityUtils.toString(entity1, "UTF-8");
        System.out.println(str);
    }
}
===========================================================================
3.Jsoup:专业用于网页解析的一个工具,擅长解析网页,也可以发送请求
// Sending GET and POST requests with Jsoup
@Test
public void jsoup() throws Exception {
    // Build the request fluently: target address, one request parameter, then send it as GET.
    // Calling post() in place of get() would send the same request as POST.
    Document page = Jsoup.connect("http://www.itcast.cn")
            .data("username", "zhangsan")
            .get();
    // Print the parsed document.
    System.out.println(page);
}
=============================================================================================================
使用jsoup进行网页解析
// Parse hotel names from the Meituan hotel listing page
@Test
public void method7() throws Exception {
    // CSS selector for the <img> elements to extract from the page.
    String imageSelector = "#list-view > div.poi-results > article > div.picture-wrapper > a > img";
    // Fetch and parse the page; the second argument is the timeout in milliseconds.
    URL listingUrl = new URL("http://hotel.meituan.com/beijing/");
    Document listingPage = Jsoup.parse(listingUrl, 5000);
    // Select the matching elements and print the value of each one's alt attribute.
    Elements images = listingPage.select(imageSelector);
    for (Element image : images) {
        String altText = image.attr("alt");
        System.out.println(altText);
    }
}
======================================================================
模拟登录
/**
 * Simulates a login: posts the login form, then requests every address returned in the
 * response's {@code Location} headers with the same client and prints each response body.
 *
 * @throws Exception if a request fails or a response body cannot be read
 */
@Test
public void method10() throws Exception {
    // One client is used for both requests; try-with-resources closes it even on failure.
    try (CloseableHttpClient client = HttpClients.createDefault()) {
        HttpPost hp = new HttpPost("http://www.svn.club/user/login");
        // Login form fields.
        List<BasicNameValuePair> parameters = new ArrayList<>();
        parameters.add(new BasicNameValuePair("uid", "xxxx"));
        parameters.add(new BasicNameValuePair("pwd", "xxxxxxxxx"));
        parameters.add(new BasicNameValuePair("x", "81"));
        parameters.add(new BasicNameValuePair("y", "23"));
        UrlEncodedFormEntity entity = new UrlEncodedFormEntity(parameters);
        hp.setEntity(entity);

        // Only the redirect headers of the login response are needed, so read them and
        // close the response right away to release its connection.
        Header[] headers;
        try (CloseableHttpResponse response = client.execute(hp)) {
            headers = response.getHeaders("Location");
        }

        // Follow the redirect manually.
        for (Header header : headers) {
            // Resolve against the login URL so that both absolute and relative Location
            // values produce a valid target (plain concatenation breaks on absolute URLs).
            java.net.URI target =
                    java.net.URI.create("http://www.svn.club/user/login").resolve(header.getValue());
            HttpGet hg = new HttpGet(target);
            try (CloseableHttpResponse response2 = client.execute(hg)) {
                HttpEntity entity2 = response2.getEntity();
                System.out.println(EntityUtils.toString(entity2, "UTF-8"));
            }
        }
    }
}