爬虫第一天
爬虫
1、 爬虫的入门
1 简单介绍
什么是爬虫?
网络爬虫(又称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常地称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫。
2 简单的爬虫编程
package cn.imust;
import org.apache.http.HttpEntity;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
/**
 * Minimal crawler example: issues one GET request with Apache HttpClient and
 * prints the returned page source to standard output.
 */
public class test1 {
    /**
     * Fetches the hard-coded login page and prints its body when the server
     * answers with HTTP 200. Any other status code produces no output.
     *
     * @param args unused
     * @throws Exception if the request fails, the body cannot be read, or a
     *                   resource cannot be closed
     */
    public static void main(String[] args) throws Exception {
        // Target address of the request
        HttpGet httpGet = new HttpGet("http://123.56.125.121:8080/tavel/admin/login.html");

        // try-with-resources closes the response first and the client second,
        // even when reading the body throws, so the connection is never leaked.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only a 200 response carries the page we want to print
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content);
            }
        }
    }
}
控制台输出
酒店管理后台登录
<div class="layui-form-item">
<button class="layui-btn layui-btn layui-btn-normal layui-btn-fluid" lay-submit="" lay-filter="login">登 入</button>
</div>
</form>
</div>
</div>
会输出一个静态的网页源码,即访问成功。
### 3 HttpGet
``不带参数的httpGet``
```java
package cn.imust;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import javax.swing.text.html.parser.Entity;
import java.io.IOException;
/**
 * GET request without parameters: fetches a page and prints the length of
 * its body.
 */
public class HttpGetTest {
    /**
     * Sends a GET request to the configured address and prints the number of
     * characters in the response body when the status code is 200.
     * I/O failures are reported via their stack trace instead of being rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Address to visit
        String it = "https://www.itcast.com/";
        HttpGet httpGet = new HttpGet(it);

        // try-with-resources replaces the manual finally block: the old code
        // called response.close() unconditionally, which threw a
        // NullPointerException (and skipped closing the client) whenever
        // execute() failed before a response was assigned.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Parse the response body as UTF-8 text
                String string = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(string.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
```
``带参数的httpGet``
package cn.imust;
import com.sun.javafx.fxml.builder.URLBuilder;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * GET request with query parameters: builds the URI with {@code URIBuilder},
 * fetches it and prints the length of the response body.
 */
public class HttpGetParamTest {
    /**
     * Builds the search URI with the query parameter {@code keys=java}, prints
     * the resulting request, sends it, and prints the number of characters in
     * the body when the status code is 200. I/O failures during the exchange
     * are reported via their stack trace.
     *
     * @param args unused
     * @throws Exception if the URI cannot be built from the given string
     */
    public static void main(String[] args) throws Exception {
        // Base address plus one query parameter
        URIBuilder uriBuilder = new URIBuilder("https://yun.itheima.com/search");
        uriBuilder.setParameter("keys", "java");

        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("你的httpGet请求的地址"+httpGet);

        // try-with-resources replaces the manual finally block: the old code
        // called response.close() unconditionally, which threw a
        // NullPointerException (and skipped closing the client) whenever
        // execute() failed before a response was assigned.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Parse the response body as UTF-8 text
                String string = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(string.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
post
package cn.imust;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * POST request without parameters: posts to a page and prints the length of
 * the response body.
 */
public class HttpPostTest {
    /**
     * Sends a body-less POST request to the configured address and prints the
     * number of characters in the response body when the status code is 200.
     * I/O failures are reported via their stack trace instead of being rethrown.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Address to visit
        String it = "https://www.itcast.cn/";
        HttpPost httpPost = new HttpPost(it);

        // try-with-resources replaces the manual finally block: the old code
        // called response.close() unconditionally, which threw a
        // NullPointerException (and skipped closing the client) whenever
        // execute() failed before a response was assigned.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Parse the response body as UTF-8 text
                String string = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(string.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
相比于get请求 post请求基本没什么变化
但是当带参数时会有变化:相比于get请求,post请求的参数需要使用List集合存储所需要的key-value,再封装成表单实体放入请求中。
package cn.imust;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * POST request with form parameters: sends the key-value pairs as a
 * URL-encoded form entity and prints the length of the response body.
 */
public class HttpPostParamTest {
    /**
     * Posts the form field {@code keys=java} to the search address and prints
     * the number of characters in the response body when the status code is
     * 200. I/O failures during the exchange are reported via their stack trace.
     *
     * @param args unused
     * @throws Exception if the form entity cannot be created with the
     *                   requested charset
     */
    public static void main(String[] args) throws Exception {
        // Address to visit
        String it1 = "http://yun.itheima.com/search";
        HttpPost httpPost = new HttpPost(it1);

        // POST parameters are collected as name/value pairs and wrapped in a
        // form entity that becomes the request body.
        List<NameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("keys", "java"));
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
        httpPost.setEntity(formEntity);

        // try-with-resources replaces the manual finally block: the old code
        // called response.close() unconditionally, which threw a
        // NullPointerException (and skipped closing the client) whenever
        // execute() failed before a response was assigned.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                // Parse the response body as UTF-8 text
                String string = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(string.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}