网络爬虫(一)

一.http请求发送的三种方式

    1.使用JDK自带的URL类访问

    1.1通过get请求获取百度首页的html文本内容

	/**
	 * Fetches the Baidu home page with a GET request through the JDK's
	 * {@code URL} / {@code URLConnection} API and prints the HTML to stdout.
	 *
	 * @throws Exception if the URL is malformed or the request / read fails
	 */
	@Test
	public void testName() throws Exception {
		// Target address: www.baidu.com
		URL url = new URL("http://www.baidu.com");
		// Obtain the connection object for that address
		URLConnection connection = url.openConnection();

		StringBuilder html = new StringBuilder();
		// try-with-resources closes the reader, and through it the response stream,
		// even when reading fails. The charset is named explicitly so decoding does
		// not depend on the platform default.
		try (BufferedReader bufferedReader = new BufferedReader(
				new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
			String line;
			while ((line = bufferedReader.readLine()) != null) {
				// readLine() drops the line terminator, so lines are joined back to back
				html.append(line);
			}
		}
		System.out.println(html);
	}

    1.2通过post请求获取百度首页的html文本内容

	/**
	 * Sends a form-style POST request to the Baidu home page through
	 * {@code HttpURLConnection} and prints the returned HTML to stdout.
	 *
	 * @throws Exception if the URL is malformed or the request / read fails
	 */
	@Test
	public void testName2() throws Exception {
		// Target address
		URL url = new URL("http://www.baidu.com");
		// Obtain the HTTP connection object
		HttpURLConnection openConnection = (HttpURLConnection) url.openConnection();
		// Use the POST method
		openConnection.setRequestMethod("POST");
		// A request body will be written, so output must be enabled
		openConnection.setDoOutput(true);

		// Write the form parameters; closing the stream finishes the request body.
		// The charset is explicit so the bytes sent do not depend on the platform default.
		try (OutputStream outputStream = openConnection.getOutputStream()) {
			outputStream.write("username=zhangsan&password=list".getBytes("UTF-8"));
		}

		StringBuilder html = new StringBuilder();
		// Read the response; try-with-resources closes the reader and the underlying stream
		try (BufferedReader bufferedReader = new BufferedReader(
				new InputStreamReader(openConnection.getInputStream(), "UTF-8"))) {
			String line;
			while ((line = bufferedReader.readLine()) != null) {
				// readLine() drops the line terminator, so lines are joined back to back
				html.append(line);
			}
		}
		System.out.println(html);
	}

  

  2.使用HttpClient

    2.1导包

<!-- Maven dependency: Apache HttpComponents HttpClient, version 4.5.2 -->
<dependency>
				<groupId>org.apache.httpcomponents</groupId>
				<artifactId>httpclient</artifactId>
				<version>4.5.2</version>
</dependency>
<!-- Maven dependency: Apache HttpComponents HttpCore, version 4.4.6 -->
<dependency>
				<groupId>org.apache.httpcomponents</groupId>
				<artifactId>httpcore</artifactId>
				<version>4.4.6</version>
</dependency>

   2.2HttpClient发送get请求

	/**
	 * Fetches the Baidu home page with an HttpClient GET request and prints the
	 * response body to stdout.
	 *
	 * @throws Exception if the request fails or the body cannot be read
	 */
	@Test
	public void testName5() throws Exception {
		// Request method and address
		HttpGet httpGet = new HttpGet("http://www.baidu.com");
		// Both the client and the response hold connections, so they are opened in
		// try-with-resources and released even when reading the body fails.
		try (CloseableHttpClient client = HttpClients.createDefault();
				CloseableHttpResponse response = client.execute(httpGet)) {
			// The response body is exposed as an HttpEntity
			HttpEntity entity = response.getEntity();
			// EntityUtils turns the entity into a String; "UTF-8" is only the fallback
			// used when the response itself does not declare a charset
			String string = EntityUtils.toString(entity, "UTF-8");
			System.out.println(string);
		}
	}
	
   2.3HttpClient发送post请求
	/**
	 * Sends a form POST with HttpClient and, when the server answers with status
	 * 200, prints the response body decoded as UTF-8.
	 *
	 * @throws Exception if the request fails or the body cannot be read
	 */
	@Test
	public void testName6() throws Exception {
		// Request method and address
		HttpPost post = new HttpPost("https://www.meitulu.com/");

		// Form parameters carried in the request body
		List<BasicNameValuePair> parameters = new ArrayList<>();
		parameters.add(new BasicNameValuePair("username", "zhangsan"));
		parameters.add(new BasicNameValuePair("password", "123"));

		// Attach the parameters as a URL-encoded form entity
		UrlEncodedFormEntity entity = new UrlEncodedFormEntity(parameters);
		post.setEntity(entity);

		// try-with-resources releases the connection on every path, including the
		// non-200 case where the body is never read
		try (CloseableHttpClient client = HttpClients.createDefault();
				CloseableHttpResponse response = client.execute(post)) {
			// Only handle the body when the status code is 200
			int statusCode = response.getStatusLine().getStatusCode();
			if (statusCode == 200) {
				// The response body is exposed as an HttpEntity
				HttpEntity entity2 = response.getEntity();
				// Convert the entity to text with the EntityUtils helper
				String string = EntityUtils.toString(entity2, Charset.forName("UTF-8"));

				System.out.println(string);
			}
		}
	}

    3.使用Jsoup发送请求并解析网页内容

    3.1导包

<!-- Maven dependency: jsoup, version 1.10.3 -->
<dependency>
		  <!-- jsoup HTML parser library @ https://jsoup.org/ -->
		  <groupId>org.jsoup</groupId>
		  <artifactId>jsoup</artifactId>
		  <version>1.10.3</version>
		</dependency>

    3.2get请求获取文档内容

	/**
	 * Issues a Jsoup GET request carrying one request parameter and prints the
	 * returned document to stdout.
	 *
	 * @throws Exception if the request fails
	 */
	@Test
	public void testName7() throws Exception {
		// Address, request parameter and GET execution, one step per line
		Document page = Jsoup
				.connect("https://www.meitulu.com")
				.data("username", "zhangsan")
				.get();
		String html = page.toString();
		System.out.println(html);
	}

    3.3POST请求获取文档内容

	/**
	 * Issues a Jsoup POST request and prints the returned document to stdout.
	 *
	 * @throws Exception if the request fails
	 */
	@Test
	public void testName8() throws Exception {
		// Connect to the address and execute the request as a POST
		String html = Jsoup
				.connect("https://www.meitulu.com")
				.post()
				.toString();
		System.out.println(html);
	}

3.4Jsoup解析html页面

	/**
	 * Downloads a news article page with Jsoup and prints the text of the
	 * elements matched by the CSS selector {@code #epContentLeft > h1}.
	 *
	 * @throws Exception if the request fails
	 */
	@Test
	public void testName9() throws Exception {
		String address = "http://news.163.com/18/0515/21/DHSKCHBK000187VE.html";
		Document page = Jsoup.connect(address).get();
		// Select by CSS selector, then take the combined text of the matches
		String headline = page.select("#epContentLeft > h1").text();
		System.out.println(headline);
	}



阅读更多
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭
关闭