要求:
1.给定了一个网页网址(URL),这个就是我们爬虫项目的入口网页,从哪开始爬
http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml
2.把当天的新闻内容全部爬取保存到本地文件中
3.方便以后我们可以迅速查找(在本地文件中)某个新闻,供我们做分析使用
这里为了简化,我们的要求就是找到对应的新闻内容打印输出到控制台System.out.println()
4.爬取的数据按天为单位划分目录,一天生成一个文件夹,文件夹下有2个文件,
一个是数据文件(存储爬取的所有新闻),一个是索引文件(存储某个新闻对应的位置,方便我们查找)。
爬取前先分析网页结构,找到自己需要内容的部分,编写正则表达式
一、建立一个maven项目
二、查取网页内容并储存在对应的数据文件和索引文件
1、建立所需工具包
/**
 * Utility for quietly closing I/O resources.
 *
 * @author Administrator
 */
public class CloseUtil {
    /**
     * Closes the given resource, swallowing (but logging) any failure.
     * Null-safe, so callers may pass a resource that was never opened.
     *
     * @param obj the resource to close; may be {@code null}
     */
    public static void close(AutoCloseable obj) {
        if (obj == null) {
            return;
        }
        try {
            obj.close();
        } catch (Exception e) {
            // A failed close is non-fatal for this crawler; just log it.
            e.printStackTrace();
        }
    }
}
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
/**
 * Downloads web page content for the crawler.
 *
 * @author Administrator
 */
public class WebUtil {
    /**
     * Fetches the resource at the given URL and returns it as text.
     * NOTE: line terminators are normalized to {@code System.lineSeparator()},
     * so byte offsets computed from this string will not match the raw page —
     * use {@link #urlGetByteArray(String)} when exact lengths matter.
     *
     * @param urlStr   the URL to fetch
     * @param encoding the charset used to decode the response body
     * @return the decoded page text; empty string on failure
     */
    public static String urlGetString(String urlStr, String encoding) {
        StringBuilder sb = new StringBuilder();
        BufferedReader br = null;
        try {
            URLConnection conn = new URL(urlStr).openConnection();
            br = new BufferedReader(new InputStreamReader(
                    conn.getInputStream(), encoding));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append(System.lineSeparator());
            }
        } catch (Exception e) {
            // Best-effort fetch: log and return whatever was read so far.
            e.printStackTrace();
        } finally {
            CloseUtil.close(br);
        }
        return sb.toString();
    }

    /**
     * Fetches the resource at the given URL and returns its raw bytes,
     * so the caller can record the exact stored length of an article.
     *
     * @param urlStr the URL to fetch
     * @return the raw response bytes; empty array on failure
     */
    public static byte[] urlGetByteArray(String urlStr) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        BufferedInputStream bis = null;
        byte[] byteArray = new byte[0];
        try {
            URLConnection conn = new URL(urlStr).openConnection();
            bis = new BufferedInputStream(conn.getInputStream());
            // Read in chunks; the original single-byte read() loop made one
            // call per byte, which is needlessly slow for whole pages.
            byte[] buf = new byte[8192];
            int n;
            while ((n = bis.read(buf)) != -1) {
                baos.write(buf, 0, n);
            }
            byteArray = baos.toByteArray();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(bis);
            CloseUtil.close(baos);
        }
        return byteArray;
    }
}
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regex helpers for extracting the needed pieces from crawled page text.
 *
 * @author Administrator
 */
public class RegexUtil {
    /**
     * Concatenates every match of {@code regex} found in {@code input}.
     *
     * @param input the text to search
     * @param regex the pattern to look for
     * @return all matched substrings joined together; empty string if none
     */
    public static String match(String input, String regex) {
        StringBuilder sb = new StringBuilder();
        Matcher m = Pattern.compile(regex).matcher(input);
        while (m.find()) {
            sb.append(m.group());
        }
        return sb.toString();
    }

    /**
     * Returns capture group {@code grpNum} of the LAST match of
     * {@code regex} in {@code input}. (The crawler passes single-record
     * strings, so first and last match coincide there.)
     *
     * @param input  the text to search
     * @param regex  the pattern containing capture groups
     * @param grpNum 1-based capture-group index
     * @return the group's text, or an empty string if there is no match
     */
    public static String match(String input, String regex, int grpNum) {
        String result = "";
        Matcher m = Pattern.compile(regex).matcher(input);
        while (m.find()) {
            result = m.group(grpNum);
        }
        return result;
    }

    /**
     * Collects every match of {@code regex} in {@code input} into a list.
     *
     * @param input the text to search
     * @param regex the pattern to look for
     * @return all matched substrings in order; empty list if none
     */
    public static List<String> matchList(String input, String regex) {
        List<String> list = new ArrayList<String>();
        Matcher m = Pattern.compile(regex).matcher(input);
        while (m.find()) {
            list.add(m.group());
        }
        return list;
    }
}
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
/**
 * Appends crawled content to the on-disk data and index files.
 *
 * @author Administrator
 */
public class IOUtil {
    /**
     * Appends the given bytes to the data file (created if absent).
     *
     * @param dataFile path of the data file
     * @param ba       raw article bytes to append
     */
    public static void writeDataFile(String dataFile, byte[] ba) {
        // try-with-resources guarantees the stream is closed on all paths.
        try (OutputStream os = new FileOutputStream(dataFile, true)) {
            os.write(ba);
        } catch (IOException e) {
            // Best-effort persistence: log and continue with the next article.
            e.printStackTrace();
        }
    }

    /**
     * Appends one index record (a single line) to the index file.
     *
     * @param indexFile path of the index file
     * @param str       the index record to append
     */
    public static void writeIndexFile(String indexFile, String str) {
        try (PrintWriter pw = new PrintWriter(new FileOutputStream(indexFile, true))) {
            pw.println(str);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
2、准备就绪开始爬取
import java.io.File;
import java.util.List;
import cn.dd.util.IOUtil;
import cn.dd.util.RegexUtil;
import cn.dd.util.WebUtil;
/**
 * Entry point of the crawler: fetches the Sina rolling-news list page,
 * extracts each article's URL/title/date, then stores every article body
 * in a per-day data file plus an index file for later lookup.
 */
public class Spider {
    /** Separator between fields of one index record. */
    private static final char FIELD_SEPARATOR = '\u0001';
    /** Root directory; one sub-directory per crawl day is created under it. */
    private static final String BASE_DIR =
            "F:" + File.separator + "something" + File.separator;

    public static void main(String[] args) {
        Spider.crawler();
    }

    /**
     * Crawls the list page and hands each article's URL, title and date
     * to {@link #detailProcessor(String, String, String)}.
     */
    public static void crawler() {
        String urlStr = "http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml";
        String encoding = "gb2312";
        // Fetch the list page as text.
        String input = WebUtil.urlGetString(urlStr, encoding);
        // Narrow down to the news <ul> first, then split into <li> entries.
        String ulRegex = "<ul class=\"list_009\">[\\s\\S]*?</ul>";
        String ulResult = RegexUtil.match(input, ulRegex);
        String liRegex = "<li>[\\s\\S]*?</li>";
        List<String> list = RegexUtil.matchList(ulResult, liRegex);
        // Groups: 1 = article URL, 2 = title, 3 = date part of "(date time)".
        // Hoisted out of the loop: it is the same pattern for every entry.
        String grpRegex = "<li><a href=\"([\\S]*?)\" target=\"_blank\">([\\S\\s]*?)</a><span>\\(([\\S]*?) [\\S]*?\\)</span></li>";
        for (String str : list) {
            String liUrlStr = RegexUtil.match(str, grpRegex, 1);
            String liTitle = RegexUtil.match(str, grpRegex, 2);
            String liDate = RegexUtil.match(str, grpRegex, 3);
            Spider.detailProcessor(liUrlStr, liTitle, liDate);
        }
    }

    /**
     * Downloads one article and appends its raw bytes to the day's data
     * file, recording "title SEP offset SEP length SEP url" in the day's
     * index file so the article can be located again later.
     *
     * @param liUrlStr the article URL
     * @param liTitle  the article title
     * @param liDate   the publication date; used as the directory name
     */
    public static void detailProcessor(String liUrlStr, String liTitle,
            String liDate) {
        byte[] ba = WebUtil.urlGetByteArray(liUrlStr); // fetch detail page
        String fileBaseDir = BASE_DIR + liDate + File.separator;
        File fileBaseDirObj = new File(fileBaseDir);
        if (!fileBaseDirObj.exists()) {
            fileBaseDirObj.mkdirs();
        }
        String dataPath = fileBaseDir + "spider_data.dat";
        String indexPath = fileBaseDir + "spider_index.dat";
        // Current length of the data file == start offset of this article.
        long pos = new File(dataPath).length();
        StringBuilder sb = new StringBuilder();
        sb.append(liTitle).append(FIELD_SEPARATOR).append(pos)
                .append(FIELD_SEPARATOR).append(ba.length)
                .append(FIELD_SEPARATOR).append(liUrlStr);
        IOUtil.writeDataFile(dataPath, ba);
        IOUtil.writeIndexFile(indexPath, sb.toString());
    }
}
爬取内容基本完成,运行可生成对应要求的数据文件和索引文件
三、客户端的建立
1、编写所需工具包
import java.io.RandomAccessFile;
/**
 * Reads an article back out of a data file using the byte offset and
 * length recorded in the index file.
 */
public class IndexUtil {
    /**
     * Convenience overload that decodes with UTF-8.
     *
     * @param pos      byte offset within the data file, as decimal text
     * @param size     number of bytes to read, as decimal text
     * @param dataFile path of the data file
     * @return the decoded article, or an empty string on failure
     */
    public static String index(String pos, String size, String dataFile) {
        // Delegate instead of duplicating the whole read routine.
        return index(pos, size, dataFile, "utf-8");
    }

    /**
     * Reads {@code size} bytes starting at offset {@code pos} from
     * {@code dataFile} and decodes them with the given charset.
     *
     * @param pos      byte offset within the data file, as decimal text
     * @param size     number of bytes to read, as decimal text
     * @param dataFile path of the data file
     * @param encoding charset used to decode the bytes
     * @return the decoded article, or an empty string on failure
     */
    public static String index(String pos, String size, String dataFile,
            String encoding) {
        String str = "";
        try (RandomAccessFile raf = new RandomAccessFile(dataFile, "r")) {
            raf.seek(Long.parseLong(pos));
            byte[] b = new byte[Integer.parseInt(size)];
            // readFully guarantees the whole record is read; a plain read()
            // may legally return fewer bytes and truncate the article.
            raf.readFully(b);
            str = new String(b, encoding);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return str;
    }
}
2、编写客户端
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import cn.dd.util.CloseUtil;
import cn.dd.util.IndexUtil;
/**
 * Client: given an article URL, finds its index record, reads the stored
 * body out of the data file and prints it to the console.
 */
public class SpiderIndex {
    public static void main(String[] args) {
        String str = "http://news.sina.com.cn/c/nd/2018-08-23/doc-ihicsiav8438010.shtml";
        SpiderIndex.input(str);
    }

    /**
     * Scans the index file line by line for the record whose URL field
     * matches {@code str}, then prints the corresponding article.
     * NOTE(review): the day directory "08月23日" is hard-coded, so this
     * client only reads the crawl of that one date — consider passing the
     * date in.
     *
     * @param str the article URL to look up
     */
    public static void input(String str) {
        String indexFile = "F:" + File.separator + "/something/08月23日/spider_index.dat";
        String dataFile = "F:" + File.separator + "/something/08月23日/spider_data.dat";
        BufferedReader bu = null;
        try {
            bu = new BufferedReader(new InputStreamReader(
                    new FileInputStream(indexFile), "utf-8"));
            String len;
            while ((len = bu.readLine()) != null) {
                // Record layout: title \u0001 offset \u0001 length \u0001 url
                String[] st = len.split("\u0001");
                // Skip malformed/blank lines instead of throwing
                // ArrayIndexOutOfBoundsException on st[3].
                if (st.length < 4) {
                    continue;
                }
                if (str.equals(st[3])) {
                    String s = IndexUtil.index(st[1], st[2], dataFile);
                    System.out.println(s);
                    break;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CloseUtil.close(bu);
        }
    }
}