爬取网址:
https://movie.douban.com/cinema/nowplaying/yantai/
网页的基本信息可以从网页源代码中看出来,这里用正则表达式来爬取。因为是初学,正则用得比较粗糙,代码也比较 low,贴出来当个笔记,以后没事翻出来看看。
import java.util.*;
import java.net.URL;
import java.util.regex.*;
import java.io.*;
import java.lang.*;
/**
 * Breadth-first crawler that starts at Douban's "now playing" listing and
 * follows the movie links found on each page.
 *
 * <p>Every crawled page is also handed to {@code getinformation}, which
 * prints and stores the details it can extract from that page.
 */
public class WebCrawler {
    /** The crawl loop keeps running while the visited count is at most this value. */
    private static final int MAX_TRAVERSED = 100;

    /** Pause between two consecutive requests, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 1600;

    /** Start of every movie link that is collected from a page. */
    private static final String SUBJECT_PREFIX = "https://movie.douban.com/subject";

    /** Marker that terminates a collected link (the marker itself is not included). */
    private static final String POSTER_MARKER = "playing_poster";

    /**
     * Prints the banner and crawls the Yantai "now playing" listing.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("豆瓣电影在线购票&影讯");
        String url = "https://movie.douban.com/cinema/nowplaying/yantai/";
        crawler(url);
    }

    /**
     * Crawls pages breadth-first, beginning with {@code startingURL}.
     *
     * <p>Each URL is visited at most once. The loop ends when no URL is
     * pending, when the visited count exceeds {@link #MAX_TRAVERSED}, or when
     * the thread is interrupted during a pause.
     *
     * @param startingURL first page to fetch
     */
    public static void crawler(String startingURL) {
        if (startingURL == null) {
            // ArrayDeque rejects null elements; there is nothing to fetch anyway.
            System.out.println("Error: no starting URL");
            return;
        }
        Deque<String> pending = new ArrayDeque<>();
        Set<String> traversed = new HashSet<>();
        pending.add(startingURL);

        while (!pending.isEmpty() && traversed.size() <= MAX_TRAVERSED) {
            String urlString = pending.remove();
            if (!traversed.add(urlString)) {
                continue; // already visited
            }
            System.out.println("Crawl " + urlString);
            for (String s : getSubURLs(urlString)) {
                if (!traversed.contains(s)) {
                    pending.add(s);
                }
            }
            // One pause per crawled page, so there is always a gap before the
            // next request even when the page contributed no new links.
            if (!pending.isEmpty() && !pause()) {
                return;
            }
        }
    }

    /**
     * Downloads {@code urlString}, collects the movie links it contains and
     * then lets {@code getinformation} scrape the same page.
     *
     * <p>If the page cannot be opened, an error line is printed, the scrape
     * is skipped and an empty list is returned.
     *
     * @param urlString page to download
     * @return the links found on the page, in document order (may contain duplicates)
     */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<>();
        // The charset is given explicitly so decoding does not depend on the
        // platform default; closing the Scanner also closes the HTTP stream.
        try (Scanner input = new Scanner(new URL(urlString).openStream(), "UTF-8")) {
            while (input.hasNextLine()) {
                collectSubjectUrls(input.nextLine(), list);
            }
        } catch (IOException ex) {
            System.out.println("Error: " + ex.getMessage());
            return list;
        }

        if (pause()) {
            getinformation g = new getinformation(urlString);
            g.GetIf();
        }
        return list;
    }

    /** Appends every "subject prefix ... poster marker" span of {@code line} to {@code sink}. */
    private static void collectSubjectUrls(String line, List<String> sink) {
        int start = line.indexOf(SUBJECT_PREFIX);
        while (start >= 0) { // index 0 is a valid hit
            int end = line.indexOf(POSTER_MARKER, start);
            if (end < 0) {
                return;
            }
            sink.add(line.substring(start, end));
            // Keep looking for the same prefix; the marker of the previous hit is skipped.
            start = line.indexOf(SUBJECT_PREFIX, end);
        }
    }

    /** Sleeps for the request delay; returns false (with the interrupt flag restored) if interrupted. */
    private static boolean pause() {
        try {
            Thread.sleep(REQUEST_DELAY_MS);
            return true;
        } catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            System.out.println("Error: " + ex.getMessage());
            return false;
        }
    }
}
/**
 * Scrapes one page for movie titles, the short-comment count, commenting
 * users, short comments and genres, echoing every finding to the console and
 * appending it to the output file. Afterwards it runs {@code getratingValue}
 * and {@code getStaff} on the same URL.
 */
class getinformation
{
    /** File every finding is appended to. */
    private static final String OUTPUT_PATH = "E:\\temp.txt";

    /** Pause before each follow-up request, in milliseconds. */
    private static final long REQUEST_DELAY_MS = 1600;

    /** Closing tag that ends a short comment. */
    private static final String SPAN_CLOSE = "</span>";

    private String urlString;

    // Regular expressions, one per extracted item.
    static final String title="(data-title=\".*\")"; // movie title attribute
    static final String comment="(<span class=\"short\">[\\s\\S.]*)";
    static final String genre="\"v:genre\">[\u4E00-\u9FA5]+";
    static final String user=" <a href=\"https://www.douban.com/people/.*/\" class=\"\">.*</a>";
    static final String shortnum="<a href=\"https://movie.douban.com/subject/.*/comments\\?status=P\">.*<";

    // Compiled once instead of on every GetIf() call.
    private static final Pattern TITLE_P = Pattern.compile(title);
    private static final Pattern COMMENT_P = Pattern.compile(comment);
    private static final Pattern GENRE_P = Pattern.compile(genre);
    private static final Pattern USER_P = Pattern.compile(user);
    private static final Pattern SHORTNUM_P = Pattern.compile(shortnum);

    /** Most recently seen title; printed in front of each user name and in the staff header. */
    String temp2="0";
    getratingValue grv;
    getStaff gsf;

    /**
     * @param u URL of the page to scrape
     */
    public getinformation(String u)
    {
        urlString=u;
    }

    /**
     * Downloads the page line by line, reports every match, then triggers the
     * rating and staff scrapers with a pause before each of them.
     *
     * <p>I/O failures are reported on the console as {@code "Error: ..."} and
     * end the scrape; nothing is thrown to the caller.
     */
    void GetIf()
    {
        try (BufferedWriter output = new BufferedWriter(new FileWriter(OUTPUT_PATH, true)))
        {
            // Explicit charset: decoding must not depend on the platform default.
            // Closing the Scanner releases the HTTP stream before the follow-up requests.
            try (Scanner input = new Scanner(new URL(urlString).openStream(), "UTF-8"))
            {
                while (input.hasNextLine())
                {
                    scanLine(input.nextLine(), output);
                }
            }

            if (pause())
            {
                grv = new getratingValue(urlString);
                grv.GetRV();
            }

            emitLine(output, temp2 + "演员及导演");

            if (pause())
            {
                gsf = new getStaff(urlString);
                gsf.GetS();
            }
        }
        catch (IOException ex)
        {
            System.out.println("Error: " + ex.getMessage());
        }
    }

    /** Applies every pattern to one line of the page and reports what matched. */
    private void scanLine(String line, BufferedWriter output) throws IOException
    {
        Matcher titleM = TITLE_P.matcher(line);
        if (titleM.find())
        {
            // The match is data-title="..."; keep what lies between the first quote and the final one.
            String attr = titleM.group(1);
            temp2 = attr.substring(attr.indexOf('"') + 1, attr.length() - 1);
            emitLine(output, temp2);
        }

        Matcher shortnumM = SHORTNUM_P.matcher(line);
        if (shortnumM.find())
        {
            String anchor = shortnumM.group(0);
            String count = "0";
            int from = anchor.lastIndexOf('>') + 1;
            // NOTE(review): the fixed 4-character trim is kept from the original code;
            // verify against the live markup that it does not cut into the number.
            int to = anchor.length() - 4;
            if (from <= to)
            {
                // Guarded: an unexpectedly short match keeps the "0" placeholder
                // instead of aborting the whole scrape with an exception.
                count = anchor.substring(from, to);
            }
            emitLine(output, "短评数量: " + count + "条");
        }

        Matcher userM = USER_P.matcher(line);
        if (userM.find())
        {
            // Text between the first '>' (exclusive) and the closing </a> the pattern ends with.
            String anchor = userM.group(0);
            String name = anchor.substring(anchor.indexOf('>') + 1, anchor.length() - 4);
            emit(output, temp2 + "用户: " + name);
        }

        Matcher commentM = COMMENT_P.matcher(line);
        if (commentM.find())
        {
            String span = commentM.group(1);
            int from = span.indexOf('>') + 1;
            int close = span.indexOf(SPAN_CLOSE, from);
            // Cut at the closing tag only when it is on this line; otherwise
            // keep the rest of the line rather than chopping off real text.
            String text = close >= 0 ? span.substring(from, close) : span.substring(from);
            emitLine(output, " " + "---短评: " + text);
        }

        Matcher genreM = GENRE_P.matcher(line);
        if (genreM.find())
        {
            emit(output, "类型: ");
            // do/while so the match consumed by the find() above is reported too.
            do
            {
                String hit = genreM.group(0);
                String name = hit.substring(hit.indexOf('>') + 1);
                emit(output, name + '/' + " ");
            }
            while (genreM.find());
        }
    }

    /**
     * Prints {@code text} without a line break and appends it to the file.
     * The writer is flushed right away because the rating and staff scrapers
     * append to the same file through their own writers while this one is
     * still open.
     */
    private static void emit(BufferedWriter output, String text) throws IOException
    {
        System.out.print(text);
        output.write(text);
        output.flush();
    }

    /** Same as {@link #emit}, followed by a line break (CRLF in the file). */
    private static void emitLine(BufferedWriter output, String text) throws IOException
    {
        System.out.println(text);
        output.write(text + "\r\n");
        output.flush();
    }

    /** Sleeps for the request delay; returns false (with the interrupt flag restored) if interrupted. */
    private static boolean pause()
    {
        try
        {
            Thread.sleep(REQUEST_DELAY_MS);
            return true;
        }
        catch (InterruptedException ex)
        {
            Thread.currentThread().interrupt();
            System.out.println("Error: " + ex.getMessage());
            return false;
        }
    }
}
/**
 * Scrapes one page for the number of raters, the raw rating entry and the
 * mainland-China release date, echoing each finding to the console and
 * appending it to the output file.
 */
class getratingValue
{
    /** File every finding is appended to. */
    private static final String OUTPUT_PATH = "E:\\temp.txt";

    private String urlString;

    // Regular expressions, one per extracted item.
    static final String votes="\"v:votes\">\\d+";
    static final String ratingValue="\"ratingValue\":.*";
    static final String releasedate="content=\".*\\(中国大陆\\)\"";

    // Compiled once instead of on every GetRV() call.
    private static final Pattern VOTES_P = Pattern.compile(votes);
    private static final Pattern RATING_VALUE_P = Pattern.compile(ratingValue);
    private static final Pattern RELEASE_DATE_P = Pattern.compile(releasedate);

    /** Last release date (rd) and vote count (V) seen; both start as "0". */
    String rd="0",V="0";

    /**
     * @param u URL of the page to scrape
     */
    public getratingValue(String u)
    {
        urlString = u;
    }

    /**
     * Downloads the page line by line and reports every match.
     *
     * <p>I/O failures are reported on the console as {@code "Error: ..."};
     * nothing is thrown to the caller.
     */
    void GetRV()
    {
        try (BufferedWriter output = new BufferedWriter(new FileWriter(OUTPUT_PATH, true));
             // Explicit charset: decoding must not depend on the platform default.
             // Closing the Scanner also closes the HTTP stream.
             Scanner input = new Scanner(new URL(urlString).openStream(), "UTF-8"))
        {
            while (input.hasNextLine())
            {
                String line = input.nextLine();

                Matcher votesM = VOTES_P.matcher(line);
                if (votesM.find())
                {
                    // The match ends with the digits themselves, so everything
                    // after '>' is the count; trimming one more character would
                    // drop its last digit.
                    String hit = votesM.group(0);
                    V = hit.substring(hit.indexOf('>') + 1);
                    System.out.println("评分人数: " + V + " ");
                    output.write("评分人数: " + V + " " + "\r\n");
                    output.flush();
                }

                Matcher ratingValueM = RATING_VALUE_P.matcher(line);
                if (ratingValueM.find())
                {
                    // Reported verbatim, from the key to the end of the line.
                    String raw = ratingValueM.group(0);
                    System.out.print(raw);
                    output.write(raw);
                    output.flush();
                }

                Matcher releasedateM = RELEASE_DATE_P.matcher(line);
                if (releasedateM.find())
                {
                    // The match is content="..."; keep what lies between the first quote and the final one.
                    String attr = releasedateM.group(0);
                    rd = attr.substring(attr.indexOf('"') + 1, attr.length() - 1);
                    System.out.print("上映日期: " + rd);
                    output.write("上映日期: " + rd);
                    output.flush();
                }
            }
        }
        catch (IOException ex)
        {
            System.out.println("Error: " + ex.getMessage());
        }
    }
}
/**
 * Scrapes one page for the names of actors and directors, echoing each name
 * to the console and appending it to the output file.
 */
class getStaff
{
    /** File every finding is appended to. */
    private static final String OUTPUT_PATH = "E:\\temp.txt";

    private String urlString;

    // Regular expressions locating the actor and director fragments of a line.
    static final String actor="\"v:starring\">.*<";
    static final String direct="\"v:directedBy\">.*<";

    // Compiled once instead of on every GetS() call (the name pattern used to
    // be recompiled for every matching line).
    private static final Pattern ACTOR_P = Pattern.compile(actor);
    private static final Pattern DIRECT_P = Pattern.compile(direct);

    /**
     * A run of CJK ideographs, middle dots, vertical bars or apostrophes.
     * Inside a character class the bar and the quotes are literal characters.
     */
    private static final Pattern NAME_P = Pattern.compile("[\\u4e00-\\u9fa5|'·']+");

    /**
     * @param u URL of the page to scrape
     */
    public getStaff(String u)
    {
        urlString=u;
    }

    /**
     * Downloads the page line by line and reports every actor and director
     * name found.
     *
     * <p>I/O failures are reported on the console as {@code "Error: ..."};
     * nothing is thrown to the caller.
     */
    void GetS()
    {
        try (BufferedWriter output = new BufferedWriter(new FileWriter(OUTPUT_PATH, true));
             // Explicit charset: decoding must not depend on the platform default.
             // Closing the Scanner also closes the HTTP stream.
             Scanner input = new Scanner(new URL(urlString).openStream(), "UTF-8"))
        {
            while (input.hasNextLine())
            {
                String line = input.nextLine();

                Matcher actorM = ACTOR_P.matcher(line);
                if (actorM.find())
                {
                    reportNames("主演: ", actorM.group(0), output);
                }

                Matcher directM = DIRECT_P.matcher(line);
                if (directM.find())
                {
                    reportNames("导演: ", directM.group(0), output);
                }
            }
        }
        catch (IOException ex)
        {
            System.out.println("Error: " + ex.getMessage());
        }
    }

    /** Reports every name found in {@code fragment}, one per line, prefixed with {@code label}. */
    private static void reportNames(String label, String fragment, BufferedWriter output) throws IOException
    {
        Matcher nameM = NAME_P.matcher(fragment);
        while (nameM.find())
        {
            String entry = label + nameM.group(0);
            System.out.println(entry);
            output.write(entry + "\r\n");
            output.flush();
        }
    }
}
怕被豆瓣封,所以在每次请求之间加了延时来降低爬虫的速率,因此爬得比较慢。
效果图:
抓取的评论,演员导演,题目和上映日期等信息,之后存到文件中。
BufferedWriter output = new BufferedWriter(new FileWriter("E:\\temp.txt",true));
每次 output.write() 之后都要调用 output.flush():几个类各自打开了指向同一个文件的 BufferedWriter,如果不及时 flush,缓冲区里的内容写入文件的顺序可能和控制台输出的顺序不一致。
顺便贴出来正则表达式的学习网站:http://www.runoob.com/java/java-regular-expressions.html
http://www.runoob.com/regexp/regexp-syntax.html