1. Web Crawler
In my view, a so-called web crawler simply filters the content you need out of a web page and saves it to your local machine. Let's implement one in code: the example below uses Apache HttpClient to download the Jianshu homepage, pulls out every href link with a regular expression, and writes the links to a text file.
package oyz.cn.Httpclent;

import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class Pc {
    public static void main(String[] args) throws ClientProtocolException, IOException {
        pwy();
    }

    public static void pwy() throws ClientProtocolException, IOException {
        // Create a default HttpClient and issue a GET request for the page
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet get = new HttpGet("https://www.jianshu.com/");
        CloseableHttpResponse response = client.execute(get);

        // Read the response body as a UTF-8 string
        HttpEntity entity = response.getEntity();
        String result = EntityUtils.toString(entity, "UTF-8");

        // Regular expression that matches href attributes: double-quoted,
        // single-quoted, or unquoted link values
        String regex = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(result);

        // Print each matched link and write it to a local text file
        int count = 1;
        FileWriter writer = new FileWriter("C:\\Users\\Administrator\\Desktop\\links.txt");
        while (matcher.find()) {
            System.out.println("Crawled link " + count + " is: " + matcher.group());
            writer.write("Crawled link " + count + " is: " + matcher.group());
            writer.write("\r\n\r\n");
            count++;
        }

        // Release the file handle and the HTTP resources
        writer.close();
        response.close();
        client.close();
    }
}
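Note that matcher.group() returns the entire attribute, quotes included, e.g. href="https://www.jianshu.com/". Here is a minimal sketch of how the match could be trimmed down to the bare URL; the helper extractUrl is my own name for it and not part of the original program:

    // Hypothetical helper: reduce a raw match such as href="https://..."
    // to the bare URL by dropping the "href=" prefix and any quotes.
    public static String extractUrl(String rawMatch) {
        // Everything after '=' is the value; trim() removes the optional
        // whitespace the regex allows around '='.
        String value = rawMatch.substring(rawMatch.indexOf('=') + 1).trim();
        // Strip a single leading and trailing quote, if present.
        return value.replaceAll("^[\"']|[\"']$", "");
    }

Alternatively, matcher.group(1) already omits the href= prefix and yields only the (possibly still quoted) attribute value.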
Result screenshot: