昨天下午被叫去写一个爬虫,爬取某个网站的律师信息,emmmmm,所以就记录一下,方便以后使用
用的是比较老的方法了,作为初学者嘛!其实有专门做这个的库Jsoup,效率应该更高。不过把这个弄懂了的话,感觉自己也能写一个类似的框架了,无非就是解析一下html,看哪个span或者哪个label对应取值.O(∩_∩)O哈哈~,当然还有效率问题.
package reptile;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.omg.CORBA.INTERNAL;
public class Reptile {

    /**
     * Entry point: submits one task per list page (5900 pages) to a fixed
     * thread pool; each task downloads the page, extracts the lawyer ids,
     * fetches each detail page and appends the parsed record to
     * {@code D:\test\lawyerInfo.txt}.
     *
     * @throws Exception if the output file cannot be created
     */
    public static void main(String[] args) throws Exception {
        String fileName = "D:\\test\\lawyerInfo.txt";
        File file = new File(fileName);
        if (!file.exists()) {
            file.createNewFile();
        }
        // Matches the 32-char id in each detail-page link, e.g. "id=ab12...".
        String targetStr = "id=[0-9a-zA-Z]{32}";
        // Shared page counter: getAndIncrement() is a lock-free CAS, so every
        // task gets a distinct page number.
        AtomicInteger atoI = new AtomicInteger(1);
        // Pool of 50 worker threads.
        ExecutorService es = Executors.newFixedThreadPool(50);
        int taskNum = 0;
        // BUG FIX: the original condition (taskNum <= 5900) submitted 5901
        // tasks for the site's 5900 pages.
        while (taskNum < 5900) {
            Runnable task = new Runnable() {
                @Override
                public void run() {
                    String url = "http://credit.lawyerpass.com/lawyer-list.jsp?q=&type=lawyer&x=110&y=18&page="
                            + atoI.getAndIncrement();
                    try {
                        List<String> result = getPage(url);
                        lawyerInfo(targetStr, result, file);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            };
            es.submit(task);
            taskNum++;
        }
        // BUG FIX: the original never shut the pool down, so the (non-daemon)
        // worker threads kept the JVM alive forever after the crawl finished.
        // shutdown() lets queued tasks complete, then releases the threads.
        es.shutdown();
    }

    /**
     * Appends {@code content} to {@code file} through a FileChannel, holding
     * an exclusive file lock so writes from concurrent workers (or other
     * processes) do not interleave.
     */
    public static void writeByNIO(String content, File file) {
        // try-with-resources replaces the original's close-in-try plus
        // close-again-in-finally and guarantees cleanup on every path.
        try (RandomAccessFile fout = new RandomAccessFile(file, "rw");
             FileChannel fcout = fout.getChannel()) {
            fout.seek(fout.length()); // position at end of file: append
            FileLock flout = null;
            // Poll once a second until the lock is acquired.
            // BUG FIX: tryLock() returns null (it does NOT throw) when another
            // process holds the lock; the original broke out of the loop
            // immediately and then hit a NullPointerException on release().
            // It only throws OverlappingFileLockException for a lock already
            // held inside this JVM, which we retry as well.
            while (flout == null) {
                try {
                    flout = fcout.tryLock();
                } catch (OverlappingFileLockException e) {
                    // another thread in this JVM holds the lock; fall through
                }
                if (flout == null) {
                    System.out.print("lock is exist ......");
                    Thread.sleep(1000); // BUG FIX: was the misleading instance-style Thread.currentThread().sleep(1000)
                }
            }
            try {
                // NOTE(review): platform-default charset, same as the original
                // code — confirm the desired output encoding.
                fout.write(content.getBytes());
            } finally {
                flout.release();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
            System.out.print("file no find ...");
        } catch (InterruptedException e) {
            // BUG FIX: restore the interrupt flag instead of swallowing it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }

    /**
     * For every list-page line that contains a lawyer id, fetches the detail
     * page, extracts the profile fields via regex and appends one
     * space-separated record (terminated by CRLF) to the output file.
     */
    private static void lawyerInfo(String targetStr, List<String> result, File file)
            throws MalformedURLException, IOException {
        for (String str : result) {
            String regexString = RegexString(targetStr, str);
            if (regexString == null) {
                continue; // line carries no lawyer id
            }
            String uuid = regexString.substring(3); // strip the "id=" prefix
            String urlInfo = "http://credit.lawyerpass.com/lawyer.jsp?id=";
            List<String> page = getPage(urlInfo + uuid);
            // BUG FIX: the original concatenated with += in a loop (O(n^2));
            // a StringBuilder joins the page lines in linear time.
            StringBuilder pageBuf = new StringBuilder();
            for (String line : page) {
                pageBuf.append(line);
            }
            String pageStr = pageBuf.toString();
            // Extraction patterns, kept byte-identical to the page markup.
            String nameReg = "信用主体:<span>.*</span>信用编号";            // name
            String zhiyeReg = "执业证号:</label>.*</li>";                  // licence number
            String sexReg = "性别:</label>.*</li>";                        // sex
            String ageReg = "年龄:</label>.*</li>";                        // age
            // NOTE(review): trailing space after </li> looks accidental and
            // may make this field never match — verify against the real page.
            String mingzuReg = "民族:</label>.*</li> ";                    // ethnicity
            String emailReg = "email:</label>.*</li>";                     // e-mail
            String xueliReg = "学历:</label>.*</li>";                      // education
            String typeReg = "执业类型:</label>.*</li>";                   // practice type
            String politicalReg = "政治面貌:</label>.*<i class";           // political status
            String compentencyNumReg = "资格证号:</label>.*</li>";         // qualification number
            String lawyerInnerTypeReg = "所内身份:</label>.*</li>";        // role within the firm
            String companyReg = "主管司法局:</label>.*</li>";              // supervising justice bureau
            // BUG FIX: the handlers are now null-safe, so a missing field
            // yields "" here (mapped to the placeholder) instead of an NPE
            // that killed the worker thread.
            String name = defaultIfEmpty(nameHandle(RegexString(nameReg, pageStr)));
            String zhiye = defaultIfEmpty(handle(RegexString(zhiyeReg, pageStr)));
            String sex = defaultIfEmpty(handle(RegexString(sexReg, pageStr)));
            String age = defaultIfEmpty(handle(RegexString(ageReg, pageStr)));
            String email = defaultIfEmpty(handle(RegexString(emailReg, pageStr)));
            String mingzu = defaultIfEmpty(handle(RegexString(mingzuReg, pageStr)));
            String xueli = defaultIfEmpty(handle(RegexString(xueliReg, pageStr)));
            String type = defaultIfEmpty(handle(RegexString(typeReg, pageStr)));
            String political = defaultIfEmpty(politicalHandle(RegexString(politicalReg, pageStr)));
            String compentencyNum = defaultIfEmpty(handle(RegexString(compentencyNumReg, pageStr)));
            String lawyerInnerType = defaultIfEmpty(handle(RegexString(lawyerInnerTypeReg, pageStr)));
            String company = defaultIfEmpty(handle(RegexString(companyReg, pageStr)));
            StringBuilder sb = new StringBuilder();
            sb.append(name).append(" ").append(zhiye).append(" ").append(sex).append(" ").append(age).append(" ")
              .append(mingzu).append(" ").append(xueli).append(" ").append(type).append(" ").append(political).append(" ")
              .append(compentencyNum).append(" ").append(lawyerInnerType).append(" ").append(company).append(" ").append(email).append("\r\n");
            System.out.println(sb.toString());
            // Append the record to the output file.
            writeByNIO(sb.toString(), file);
        }
    }

    /** Maps a missing field (null or empty) to the placeholder "暂无". */
    private static String defaultIfEmpty(String v) {
        return (v == null || v.isEmpty()) ? "暂无" : v;
    }

    /**
     * Extracts the text between "</label>" and "<i class" in a matched
     * political-status fragment. Null-safe: returns "" when the fragment is
     * null (BUG FIX: the original threw NullPointerException on a non-match).
     */
    private static String politicalHandle(String object) {
        if (object == null) {
            return "";
        }
        int begin = object.indexOf("</label>") + 8; // 8 = "</label>".length()
        int end = object.indexOf("<i class");
        return object.substring(begin, end);
    }

    /**
     * Extracts the text between "</label>" and "</li>" in a matched field
     * fragment. Null-safe: returns "" when the fragment is null
     * (BUG FIX: the original threw NullPointerException on a non-match).
     */
    private static String handle(String object) {
        if (object == null) {
            return "";
        }
        int begin = object.indexOf("</label>") + 8; // 8 = "</label>".length()
        int end = object.indexOf("</li>");
        return object.substring(begin, end);
    }

    /**
     * Extracts the lawyer name between "<span>" and "</span>" in the matched
     * title fragment. Null-safe: returns "" when the fragment is null
     * (BUG FIX: the original threw NullPointerException on a non-match).
     */
    private static String nameHandle(String object) {
        if (object == null) {
            return "";
        }
        int begin = object.indexOf("<span>") + 6; // 6 = "<span>".length()
        int end = object.indexOf("</span>");
        return object.substring(begin, end);
    }

    /**
     * Downloads the resource at {@code url} and returns its lines.
     *
     * @throws IOException on any connection or read failure
     */
    private static List<String> getPage(String url) throws MalformedURLException,
            IOException {
        URL realUrl = new URL(url);
        URLConnection connection = realUrl.openConnection();
        connection.connect();
        List<String> list = new ArrayList<String>();
        // BUG FIX: try-with-resources closes the reader; the original leaked
        // one open stream per request (50 threads x thousands of pages).
        // NOTE(review): platform-default charset, same as the original —
        // confirm the page's actual encoding (UTF-8 vs GBK).
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                list.add(line);
            }
        }
        return list;
    }

    /**
     * Returns the first substring of {@code patternStr} matched by the regex
     * {@code targetStr}, or {@code null} if there is no match.
     */
    static String RegexString(String targetStr, String patternStr) {
        Pattern pattern = Pattern.compile(targetStr);
        Matcher matcher = pattern.matcher(patternStr);
        if (matcher.find()) {
            return matcher.group(0);
        }
        return null;
    }
}
本来爬取的速度很慢,所以加入了多线程,运用了AtomicInteger的CAS操作,在不加锁的情况下保证多线程之间页码分配不重不漏.写文件用的是NIO的FileChannel配合文件锁,避免多个线程的写入互相交叠,提高了爬取数据的可靠性.
还有值得改进的地方就是字符串匹配算法,现在用的是JDK的matcher和pattern.如果数据量更大的话可以修改JVM配置,提高JVM的性能.
这个爬虫复制下来可以直接爬取某个网站的律师数据→_→!!!!!
参考资料: 点击打开链接