<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);"></span><pre name="code" class="java">package zhihu_scrapy;
import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.*;
/**
 * Crawls the paginated internship listing on hiall.com.cn and appends every
 * posting it finds (title immediately followed by the absolute link) to a
 * local text file, echoing each entry to standard output.
 *
 * <p>No keyword filtering happens in this class: every posting matched by the
 * listing pattern is saved.
 */
public class Intern1 {
    /** Site root that the relative posting links are appended to. */
    private static final String SITE_ROOT = "http://www.hiall.com.cn";

    /** Listing URL without the page number, which is appended by {@link #main}. */
    private static final String LISTING_URL =
            "http://www.hiall.com.cn/info/part.php?regions=1&tagid=0&businesses=0&content=0&page=";

    /** File that crawled entries are appended to. */
    private static final String OUTPUT_FILE = "C:/Users/Jack/Desktop/intern.txt";

    /**
     * Matches one posting in the listing markup: group 1 is the relative link,
     * group 2 is the posting title. Compiled once instead of on every call.
     */
    private static final Pattern POSTING =
            Pattern.compile("list_midcontent.+?href=\"(.+?)\".+?blank\">(.+?)<");

    /** First and last listing page visited by {@link #main} (inclusive). */
    private static final int FIRST_PAGE = 1;
    private static final int LAST_PAGE = 49;

    /** Timeouts so a single unresponsive page cannot stall the whole crawl. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Downloads the resource at {@code url} and returns its content decoded as UTF-8.
     *
     * <p>Lines are concatenated without any separator, so the result is a single
     * line of text. That matters for {@link #regex}: the pattern's {@code .}
     * does not match line terminators.
     *
     * @param url the address to fetch
     * @return the content read so far; an empty string if the URL is malformed
     *         or nothing could be read. Failures are reported on standard error
     *         and never thrown to the caller.
     */
    public static String open(String url) {
        // StringBuilder avoids the quadratic copying of repeated String concatenation.
        StringBuilder page = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            connection.connect();
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    page.append(line);
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so it is handled here as well.
            System.err.println("Failed to fetch " + url);
            e.printStackTrace();
        }
        return page.toString();
    }

    /**
     * Extracts every posting from {@code content}, prints it, and appends it to
     * the output file via {@link #writer}.
     *
     * <p>Each entry is the posting title directly followed by the absolute link.
     *
     * @param content page markup as returned by {@link #open}; {@code null} or
     *        empty content yields no entries
     */
    public static void regex(String content) {
        if (content == null || content.isEmpty()) {
            return;
        }
        Matcher matcher = POSTING.matcher(content);
        while (matcher.find()) {
            String entry = matcher.group(2) + SITE_ROOT + matcher.group(1);
            System.out.println(entry);
            writer(entry);
        }
    }

    /**
     * Appends {@code result} followed by a line separator to the output file.
     *
     * <p>The file is opened in append mode and closed on every call, even when
     * writing fails. I/O failures are reported on standard error and not thrown.
     *
     * @param result the line to append
     */
    public static void writer(String result) {
        // The second FileWriter argument (true) selects append mode.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(OUTPUT_FILE, true))) {
            out.write(result);
            out.newLine();
        } catch (IOException e) {
            System.err.println("Failed to append to " + OUTPUT_FILE);
            e.printStackTrace();
        }
    }

    /**
     * Crawls the listing pages one after another and saves every posting found.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            regex(open(LISTING_URL + page));
        }
    }
}
<span style="font-family: Arial, Helvetica, sans-serif; background-color: rgb(255, 255, 255);">以上代码实现了在 Hiall 上爬取实习职位。如果需要筛选出包含“前端”的职位,可以对爬下来的文本再加一段读取并筛选的代码(在爬取网页时直接筛选太慢,好像是因为做两次正则匹配耗时较长)。</span>
/**
 * Scans the saved postings file for lines containing the front-end keyword
 * (the literal passed to {@code contains} below), then prints each matching
 * line and hands it to {@code writer}.
 *
 * <p>Matching lines are collected first and only written after the file has
 * been closed. Calling {@code writer} while still reading is unsafe when it
 * appends to the file being read: each appended match would be read back,
 * match again, and be appended again without end.
 *
 * <p>I/O failures while reading are reported on standard error and not thrown;
 * matches collected before the failure are still processed.
 */
public static void reader(){
    List<String> matches = new ArrayList<String>();
    try (BufferedReader br = new BufferedReader(
            new FileReader("C:/Users/Jack/Desktop/intern.txt"))) {
        String line;
        while ((line = br.readLine()) != null) {
            if (line.contains("前端")) {
                matches.add(line);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // The input file is closed at this point, so writing cannot feed the read loop.
    for (String match : matches) {
        writer(match);
        System.out.println(match);
    }
}