爬虫爬取豆瓣电影资讯

爬取网址:

https://movie.douban.com/cinema/nowplaying/yantai/

网页的基本信息可以直接从网页源代码中看出来,这里用正则表达式逐行匹配来提取。因为是初学,正则用得还不熟练,代码也比较粗糙,贴出来当作笔记,方便以后回顾。

import java.util.*;
import java.net.URL;
import java.util.regex.*;
import java.io.*;
import java.lang.*;
/**
 * Breadth-first crawler for Douban's "now playing" movie listing.
 *
 * <p>Every crawled page is scanned line by line for movie subject links, which
 * are queued for later crawling, and is then handed to {@code getinformation}
 * so that its details are printed and appended to the output file.
 */
public class WebCrawler {

	/** Crawling continues while the number of traversed pages is at most this value. */
	private static final int MAX_TRAVERSED = 100;

	/** Pause between requests, in milliseconds, to keep the request rate low. */
	private static final long REQUEST_DELAY_MS = 1600;

	/** Every movie link of interest starts with this prefix. */
	private static final String SUBJECT_PREFIX = "https://movie.douban.com/subject";

	/** A collected link ends right before this marker. */
	private static final String POSTER_MARKER = "playing_poster";

	/**
	 * Prints the banner and crawls the Yantai "now playing" listing.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		System.out.println("豆瓣电影在线购票&影讯");
		String url="https://movie.douban.com/cinema/nowplaying/yantai/";
		crawler(url);
	}

	/**
	 * Crawls pages breadth-first starting at {@code startingURL}.
	 *
	 * <p>A page is processed at most once. Links found on a page are appended
	 * to the pending queue unless they were already traversed. The crawler
	 * pauses once after each page rather than once per queued link, because
	 * queueing a link performs no network request.
	 *
	 * @param startingURL the first page to crawl
	 */
	public static void crawler(String startingURL)
	{
		if (startingURL == null) {
			System.out.println("Error: starting URL is null");
			return;
		}
		Deque<String> pending = new ArrayDeque<>();
		// A hash set makes the "already crawled?" check constant-time.
		Set<String> traversed = new HashSet<>();
		pending.add(startingURL);
		while (!pending.isEmpty() && traversed.size() <= MAX_TRAVERSED) {
			String urlString = pending.poll();
			if (!traversed.add(urlString)) {
				continue; // queued more than once; already handled
			}
			System.out.println("Crawl "+urlString);
			for (String s : getSubURLs(urlString)) {
				if (!traversed.contains(s)) {
					pending.add(s);
				}
			}
			if (!pending.isEmpty()) {
				try {
					Thread.sleep(REQUEST_DELAY_MS);
				} catch (InterruptedException ex) {
					// Keep the interrupt visible to the caller and stop crawling.
					Thread.currentThread().interrupt();
					System.out.println("Error: "+ex.getMessage());
					return;
				}
			}
		}
	}

	/**
	 * Downloads {@code urlString}, collects the movie subject links it contains
	 * and then runs {@code getinformation} on the same URL.
	 *
	 * <p>Any failure is reported on standard output; the links collected up to
	 * that point are still returned.
	 *
	 * @param urlString address of the page to scan
	 * @return the collected links, possibly empty, never {@code null}
	 */
	public static ArrayList<String> getSubURLs(String urlString)
	{
		ArrayList<String> list = new ArrayList<>();
		try {
			URL url = new URL(urlString);
			// Decode explicitly as UTF-8 instead of the platform default, and
			// close the connection as soon as the page has been read.
			try (Scanner input = new Scanner(url.openStream(), "UTF-8")) {
				while (input.hasNextLine()) {
					collectSubjectLinks(input.nextLine(), list);
				}
			}
			Thread.sleep(REQUEST_DELAY_MS);
			getinformation g = new getinformation(urlString);
			g.GetIf();
		} catch (InterruptedException ex) {
			Thread.currentThread().interrupt();
			System.out.println("Error: "+ex.getMessage());
		} catch (Exception ex) {
			System.out.println("Error: "+ex.getMessage());
		}
		return list;
	}

	/**
	 * Appends to {@code out} every substring of {@code line} that starts at a
	 * subject-link prefix and ends right before the next poster marker.
	 */
	private static void collectSubjectLinks(String line, List<String> out)
	{
		// ">= 0" so that a link starting at the very first column is not skipped.
		int start = line.indexOf(SUBJECT_PREFIX);
		while (start >= 0) {
			int end = line.indexOf(POSTER_MARKER, start);
			if (end < 0) {
				break;
			}
			out.add(line.substring(start, end));
			// Continue with the next subject link, not with an arbitrary "http:" link.
			start = line.indexOf(SUBJECT_PREFIX, end);
		}
	}
}

/**
 * Extracts the movie title, short-comment count, reviewers, short comments and
 * genres from one page, then delegates to {@code getratingValue} and
 * {@code getStaff} for the remaining details.
 *
 * <p>Everything that is found is printed to standard output and appended to
 * {@code E:\temp.txt}.
 */
class getinformation
{
	private String urlString;

	// Regular expressions applied to every line of the downloaded page.
	final String title="(data-title=\".*\")";// movie title attribute
	final String comment="(<span class=\"short\">[\\s\\S.]*)";// short comment text
	final String genre="\"v:genre\">[\u4E00-\u9FA5]+";// one genre name
	final String user=" <a href=\"https://www.douban.com/people/.*/\" class=\"\">.*</a>";// reviewer link
	final String shortnum="<a href=\"https://movie.douban.com/subject/.*/comments\\?status=P\">.*<";// link to all short comments

	String temp2="0";// most recently matched title; printed in front of every reviewer
	getratingValue grv;
	getStaff gsf;

	/**
	 * @param u address of the page to scrape
	 */
	public getinformation(String u)
	{
		urlString=u;
	}

	/**
	 * Downloads the page, reports every match line by line, and afterwards
	 * runs the rating and staff extractors on the same URL with a pause
	 * before each of them.
	 *
	 * <p>Failures are reported on standard output and never propagate.
	 */
	void GetIf()
	{
		try (BufferedWriter output = new BufferedWriter(new FileWriter("E:\\temp.txt",true)))
		{
			URL url=new URL(urlString);
			Pattern titleP=Pattern.compile(title);
			Pattern genreP=Pattern.compile(genre);
			Pattern userP=Pattern.compile(user);
			Pattern shortnumP=Pattern.compile(shortnum);
			Pattern commentP=Pattern.compile(comment);
			// Decode explicitly as UTF-8 instead of the platform default, and
			// close the connection as soon as the page has been read.
			try (Scanner input=new Scanner(url.openStream(),"UTF-8"))
			{
				while(input.hasNextLine())
				{
					String line=input.nextLine();
					reportTitle(titleP.matcher(line),output);
					reportShortCommentCount(shortnumP.matcher(line),output);
					reportReviewer(userP.matcher(line),output);
					reportComment(commentP.matcher(line),output);
					reportGenres(genreP.matcher(line),output);
				}
			}
			try {
				Thread.sleep(1600);
				grv=new getratingValue(urlString);
				grv.GetRV();
			}
			catch(InterruptedException ex)
			{
				// Keep the interrupt visible so the following sleep ends at once.
				Thread.currentThread().interrupt();
				System.out.println("Error: "+ex.getMessage());
			}
			catch(Exception ex)
			{
				System.out.println("Error: "+ex.getMessage());
			}
			emitLine(output,temp2+"演员及导演");
			try {
				Thread.sleep(1600);
				gsf=new getStaff(urlString);
				gsf.GetS();
			}
			catch(InterruptedException ex)
			{
				Thread.currentThread().interrupt();
				System.out.println("Error: "+ex.getMessage());
			}
			catch(Exception ex)
			{
				System.out.println("Error: "+ex.getMessage());
			}
		}
		catch(Exception ex)
		{
			System.out.println("Error: "+ex.getMessage());
		}
	}

	/** Remembers and reports the title that follows the first '《' or '"' of the match. */
	private void reportTitle(Matcher titleM,BufferedWriter output) throws IOException
	{
		if(!titleM.find())
			return;
		String attribute=titleM.group(1);
		for(int i=0;i<attribute.length();i++)
		{
			char c=attribute.charAt(i);
			if(c=='《'||c=='\"')
			{
				// Drop the closing quote at the end of the attribute.
				temp2=slice(attribute,i+1,1,temp2);
				break;
			}
		}
		emitLine(output,temp2);
	}

	/** Reports the text after the last '>' of the match, minus its final four characters. */
	private void reportShortCommentCount(Matcher shortnumM,BufferedWriter output) throws IOException
	{
		if(!shortnumM.find())
			return;
		String anchor=shortnumM.group(0);
		// NOTE(review): the 4-character trim is kept from the original; confirm against the live markup.
		String count=slice(anchor,anchor.lastIndexOf('>')+1,4,"0");
		emitLine(output,"短评数量: "+count+"条");
	}

	/** Reports the reviewer name, i.e. the link text without the trailing closing tag. */
	private void reportReviewer(Matcher userM,BufferedWriter output) throws IOException
	{
		if(!userM.find())
			return;
		String anchor=userM.group(0);
		// Start after the '>' so that the tag delimiter is not part of the name;
		// the match always ends with the four characters of the closing tag.
		String name=slice(anchor,anchor.indexOf('>')+1,4,"0");
		emit(output,temp2+"用户: "+name);
	}

	/** Reports the comment text between the opening tag and the closing tag, if present. */
	private void reportComment(Matcher commentM,BufferedWriter output) throws IOException
	{
		if(!commentM.find())
			return;
		String span=commentM.group(1);
		// The pattern guarantees a '>' at the end of the opening tag.
		String text=span.substring(span.indexOf('>')+1);
		// Cut at the closing tag only when it is on this line, so that a comment
		// continuing on the next line does not lose its last characters.
		int close=text.indexOf("</span>");
		if(close>=0)
			text=text.substring(0,close);
		emitLine(output," ---短评: "+text);
	}

	/** Reports every genre on the line, including the first one, after a single label. */
	private void reportGenres(Matcher genreM,BufferedWriter output) throws IOException
	{
		if(!genreM.find())
			return;
		emit(output,"类型: ");
		do
		{
			String hit=genreM.group(0);
			emit(output,hit.substring(hit.indexOf('>')+1)+"/ ");
		}
		while(genreM.find());
	}

	/**
	 * Returns {@code text} from {@code from} up to {@code trimEnd} characters
	 * before its end, or {@code fallback} when that range does not exist.
	 */
	private static String slice(String text,int from,int trimEnd,String fallback)
	{
		int to=text.length()-trimEnd;
		if(from<0||from>to)
			return fallback;
		return text.substring(from,to);
	}

	/** Prints {@code text} without a line break and appends it to the file, flushing at once. */
	private static void emit(BufferedWriter output,String text) throws IOException
	{
		System.out.print(text);
		output.write(text);
		output.flush();
	}

	/** Prints {@code text} as a line and appends it to the file with CRLF, flushing at once. */
	private static void emitLine(BufferedWriter output,String text) throws IOException
	{
		System.out.println(text);
		output.write(text+"\r\n");
		output.flush();
	}
}

/**
 * Extracts the number of votes, the rating value and the mainland-China
 * release date from one page.
 *
 * <p>Everything that is found is printed to standard output and appended to
 * {@code E:\temp.txt}.
 */
class getratingValue
{
	private String urlString;

	// Regular expressions applied to every line of the downloaded page.
	final String votes="\"v:votes\">\\d+";
	final String ratingValue="\"ratingValue\":.*";
	final String releasedate="content=\".*\\(中国大陆\\)\"";

	String rd="0",V="0";// last release date and vote count that were found

	/**
	 * @param u address of the page to scrape
	 */
	public getratingValue(String u)
	{
		urlString = u;
	}

	/**
	 * Downloads the page and reports every vote count, rating value and
	 * release date found in it.
	 *
	 * <p>Failures are reported on standard output and never propagate.
	 */
	void GetRV()
	{
		try (BufferedWriter output = new BufferedWriter(new FileWriter("E:\\temp.txt",true)))
		{
			URL url=new URL(urlString);
			Pattern releasedateP=Pattern.compile(releasedate);
			Pattern votesP=Pattern.compile(votes);
			Pattern ratingValueP=Pattern.compile(ratingValue);
			// Decode explicitly as UTF-8 instead of the platform default, and
			// close the connection as soon as the page has been read.
			try (Scanner input=new Scanner(url.openStream(),"UTF-8"))
			{
				while(input.hasNextLine())
				{
					String line=input.nextLine();
					Matcher votesM=votesP.matcher(line);
					if(votesM.find())
					{
						String hit=votesM.group(0);
						// The match ends with the digits themselves, so nothing may be
						// trimmed from its end or the last digit is lost.
						V=hit.substring(hit.indexOf('>')+1);
						emitLine(output,"评分人数: "+V+" ");
					}
					Matcher ratingValueM=ratingValueP.matcher(line);
					if(ratingValueM.find())
					{
						emit(output,ratingValueM.group(0));
					}
					Matcher releasedateM=releasedateP.matcher(line);
					if(releasedateM.find())
					{
						String hit=releasedateM.group(0);
						// Keep what lies between the first quote and the closing quote.
						rd=hit.substring(hit.indexOf('\"')+1,hit.length()-1);
						emit(output,"上映日期: "+rd);
					}
				}
			}
		}
		catch(Exception ex)
		{
			System.out.println("Error: "+ex.getMessage());
		}
	}

	/** Prints {@code text} without a line break and appends it to the file, flushing at once. */
	private static void emit(BufferedWriter output,String text) throws IOException
	{
		System.out.print(text);
		output.write(text);
		output.flush();
	}

	/** Prints {@code text} as a line and appends it to the file with CRLF, flushing at once. */
	private static void emitLine(BufferedWriter output,String text) throws IOException
	{
		System.out.println(text);
		output.write(text+"\r\n");
		output.flush();
	}
}

/**
 * Extracts the names of the actors and directors from one page.
 *
 * <p>Everything that is found is printed to standard output and appended to
 * {@code E:\temp.txt}.
 */
class getStaff
{
	private String urlString;

	// Regular expressions applied to every line of the downloaded page.
	final String actor="\"v:starring\">.*<";
	final String direct="\"v:directedBy\">.*<";

	/**
	 * A name is a run of CJK ideographs and middle dots. Inside a character
	 * class '|' and '\'' are literal members, so they are left out to keep
	 * stray separators from being reported as names.
	 */
	private static final Pattern NAME=Pattern.compile("[\\u4e00-\\u9fa5·]+");

	/**
	 * @param u address of the page to scrape
	 */
	public getStaff(String u)
	{
		urlString=u;
	}

	/**
	 * Downloads the page and reports every actor and director name found in it.
	 *
	 * <p>Failures are reported on standard output and never propagate.
	 */
	void GetS()
	{
		try (BufferedWriter output = new BufferedWriter(new FileWriter("E:\\temp.txt",true)))
		{
			URL url=new URL(urlString);
			Pattern actorP=Pattern.compile(actor);
			Pattern directP=Pattern.compile(direct);
			// Decode explicitly as UTF-8 instead of the platform default, and
			// close the connection as soon as the page has been read.
			try (Scanner input=new Scanner(url.openStream(),"UTF-8"))
			{
				while(input.hasNextLine())
				{
					String line=input.nextLine();
					Matcher actorM=actorP.matcher(line);
					if(actorM.find())
						reportNames("主演: ",actorM.group(0),output);
					Matcher directM=directP.matcher(line);
					if(directM.find())
						reportNames("导演: ",directM.group(0),output);
				}
			}
		}
		catch(Exception ex)
		{
			System.out.println("Error: "+ex.getMessage());
		}
	}

	/** Reports every name found in {@code fragment}, one per line, prefixed with {@code label}. */
	private static void reportNames(String label,String fragment,BufferedWriter output) throws IOException
	{
		Matcher nameM=NAME.matcher(fragment);
		while(nameM.find())
		{
			String entry=label+nameM.group(0);
			System.out.println(entry);
			output.write(entry+"\r\n");
			output.flush();
		}
	}
}

怕被豆瓣封禁,所以在请求之间加了 1.6 秒的延时(Thread.sleep(1600))来降低爬取速率,因此爬得比较慢。

效果图:

抓取的评论,演员导演,题目和上映日期等信息,之后存到文件中。

BufferedWriter output = new BufferedWriter(new FileWriter("E:\\temp.txt",true));

每次 output.write() 之后都要调用 output.flush():三个类各自打开了一个指向同一文件的 BufferedWriter,如果不及时 flush,缓冲区内容写入文件的先后顺序可能和控制台输出的顺序不一致。

顺便贴出来正则表达式的学习网站:www.runoob.com/java/java-regular-expressions.html

                                                          http://www.runoob.com/regexp/regexp-syntax.html

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值