C# 爬虫学习之猫眼电影

C# 爬虫学习之猫眼电影(完整代码见最后)

1、HTTP部分

1.1 引用

using System;
using System.Net;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Configuration;
using System.Diagnostics;
using System.IO;

1.2 Get方法

/// <summary>
/// Small static helper for issuing HTTP requests from the crawler.
/// </summary>
public class HTTP
	{
		/// <summary>
		/// Sends a synchronous GET request and returns the response body decoded as UTF-8.
		/// </summary>
		/// <param name="url">Absolute URL to request.</param>
		/// <param name="cookies">
		/// Raw value for the <c>Cookie</c> request header (for example <c>"ci=1"</c>),
		/// or <c>null</c> to send no cookie header.
		/// </param>
		/// <param name="timeout">Value assigned to <see cref="HttpWebRequest.Timeout"/>, in milliseconds.</param>
		/// <returns>The complete response body as a string.</returns>
		/// <remarks>
		/// Exceptions raised by <see cref="WebRequest.Create(string)"/> and
		/// <see cref="HttpWebRequest.GetResponse"/> are not caught here and propagate to the caller.
		/// </remarks>
		public static string GET(string url, string cookies = null, int timeout = 5000)
		{
			HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
			request.Method = "GET";
			request.ContentType = "text/html;charset=UTF-8";
			request.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
			request.Timeout = timeout;

			if (cookies != null)
			{
				request.Headers.Add("Cookie", cookies);
			}

			// Dispose the response, its stream and the reader even when reading throws,
			// so the underlying connection is always released.
			using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
			using (Stream responseStream = response.GetResponseStream())
			using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
			{
				return reader.ReadToEnd();
			}
		}
	}

2、数据抓取部分

2.1 步骤

  • 寻找如何提交所属城市信息
页面cookies中有个ci属性,其值为城市ID,例如ci=1即为北京。把ci属性通过cookies提交上去即可设置城市
  • 发送Get请求
// Fetch the first page of the cinema list; this.ci is passed as the cookie string (e.g. "ci=1").
string result = HTTP.GET("http://maoyan.com/cinemas?offset=0",this.ci);
// Remove every newline, space, tab and carriage return; the regular expressions
// used later are written against this whitespace-free markup (e.g. "<divclass=").
result = result.Replace("\n", "").Replace(" ","").Replace("\t","").Replace("\r","");
  • 获取到页面HTML数据后通过正则取出所需的数据,C#使用正则需要添加引用
using System.Text.RegularExpressions;
  • 取城市名
// Extract the city name only when it has not been set yet.
if (this.cityName == null) 
{
	// Group 1 captures the non-whitespace text between the city-name <div> and the caret <span>.
	Match cityName = Regex.Match(this.html,"<divclass=\"city-name\">([\\S]+?)<spanclass=\"caret\">");
	// If the pattern does not match, Groups[1].Value is the empty string.
	this.cityName = cityName.Groups [1].Value;
}
  • 取出影院名和地址
// Extract cinema info from the page: group [3] = name, group [5] = address
MatchCollection s = Regex.Matches(this.html,"<divclass=\"cinema-info\"><ahref=\"([\\S]+?)\"class.*?(}\\\">([\\S]+?)</a>).*?(地址:([\\S]+?))</p></div>.*?</div>");

// Print one numbered entry per matched cinema; cinemaCount keeps counting across calls.
for (int i = 0; i < s.Count; i++) 
{	
	this.cinemaCount++;
	Console.WriteLine (String.Format ("------第{0}家影院------", this.cinemaCount));
	Console.WriteLine (String.Format ("名字:{0}", s[i].Groups[3].Value));
	Console.WriteLine (String.Format ("地址:{0}", s[i].Groups[5].Value));
}
  • 取页面数
// Read the page count from the pagination links and print the data of the following pages
MatchCollection pageNumber = Regex.Matches(this.html,"offset=\\d+\">(\\d+)</a>");

// Only continue when more than one numbered pagination link was found.
if (pageNumber.Count > 1) 
{
	// The page count is the number captured by the last group of the last matched link.
	this.pageCount = Int32.Parse (pageNumber [pageNumber.Count - 1].Groups [pageNumber [pageNumber.Count - 1].Groups.Count - 1].Value);

	// The first page (offset=0) is already loaded, so start at m = 1; the offset advances by 12 per page.
	for (int m = 1; m <= this.pageCount - 1; m++) 
	{
		string pageUrl = String.Format ("http://maoyan.com/cinemas?offset={0}", m * 12);

		string pageData = HTTP.GET (pageUrl, this.ci);
		// Strip whitespace so the cinema regex matches, same as for the first page.
		pageData = pageData.Replace ("\n", "").Replace (" ", "").Replace ("\t", "").Replace ("\r", "");

		// Replace the current page and print its cinemas.
		this.html = pageData;
		this.getCinemaData ();
	}
}

2.2 结语

至此就可以把猫眼电影单个城市所有电影院名和地址全部取出了

完整代码

using System;
using System.Text.RegularExpressions;

namespace maoyan
{
	/// <summary>
	/// Crawls the Maoyan cinema list for one city and prints each cinema's name and address
	/// to the console.
	/// </summary>
	/// <remarks>
	/// Typical use: call <see cref="Ready"/> to download the first page for a city, then
	/// <see cref="Start"/> to print the city name and the cinemas found on that page.
	/// </remarks>
	public class Cinema
	{
		// URL template of the cinema list; {0} is the value of the "offset" query parameter.
		private const string CinemaListUrlFormat = "http://maoyan.com/cinemas?offset={0}";

		// Offset increment between two consecutive pages (presumably the number of cinemas per page).
		private const int PageOffsetStep = 12;

		private bool isReady = false;
		private bool isStart = false;
		public string cityName = null;
		// Whitespace-stripped HTML of the page currently being parsed; never null.
		private string html = "";
		public int cinemaCount = 0;
		public int pageCount = 0;
		public string ci;

		/// <summary>
		/// Resets the crawler state and downloads the first page of the cinema list.
		/// Does nothing when a previous <see cref="Ready"/> has not yet been consumed by <see cref="Start"/>.
		/// </summary>
		/// <param name="ci">Cookie string selecting the city, e.g. <c>"ci=1"</c>.</param>
		public void Ready (string ci)
		{
			if (this.isReady)
			{
				return;
			}

			this.html = "";
			this.cinemaCount = 0;
			this.cityName = null;
			this.pageCount = 0;

			// The city cookie must be stored BEFORE the request is issued; otherwise the
			// first page is fetched with a null (or stale) cookie and the wrong city is returned.
			this.ci = ci;

			// Start from the first page of the selected city.
			this.html = this.FetchPage (0);
			this.isReady = true;
		}

		/// <summary>
		/// Prints the city name and the cinemas of the page loaded by <see cref="Ready"/>.
		/// Does nothing when no page has been loaded or when a run is already in progress.
		/// </summary>
		public void Start ()
		{
			if (String.IsNullOrEmpty (this.html) || this.isStart)
			{
				return;
			}

			this.isStart = true;

			try
			{
				// City name
				this.getCityName ();
				Console.WriteLine ("当前城市:{0}",this.cityName);
				// Cinemas of the loaded page
				this.getCinemaData ();
				// Fetching the remaining pages is left disabled here;
				// call getPageCount () explicitly to crawl them.
				//this.getPageCount ();
			}
			finally
			{
				// Always release the flags so the instance stays usable after a failure.
				this.isReady = false;
				this.isStart = false;
			}
		}

		/// <summary>
		/// Extracts the city name from the current page unless it is already known.
		/// When the pattern does not match, <see cref="cityName"/> becomes the empty string.
		/// </summary>
		public void getCityName ()
		{
			if (this.cityName != null)
			{
				return;
			}

			Match cityMatch = Regex.Match (this.html, "<divclass=\"city-name\">([\\S]+?)<spanclass=\"caret\">");
			this.cityName = cityMatch.Groups [1].Value;
		}

		/// <summary>
		/// Prints the name and address of every cinema found on the current page and
		/// increments <see cref="cinemaCount"/> for each of them.
		/// </summary>
		public void getCinemaData ()
		{
			// Capture groups: [3] = cinema name, [5] = address
			MatchCollection cinemas = Regex.Matches (this.html, "<divclass=\"cinema-info\"><ahref=\"([\\S]+?)\"class.*?(}\\\">([\\S]+?)</a>).*?(地址:([\\S]+?))</p></div>.*?</div>");

			foreach (Match cinema in cinemas)
			{
				this.cinemaCount++;
				Console.WriteLine ("------第{0}家影院------", this.cinemaCount);
				Console.WriteLine ("名字:{0}", cinema.Groups [3].Value);
				Console.WriteLine ("地址:{0}", cinema.Groups [5].Value);
			}
		}

		/// <summary>
		/// Reads the page count from the pagination links of the current page, then downloads
		/// every following page and prints its cinemas via <see cref="getCinemaData"/>.
		/// </summary>
		public void getPageCount ()
		{
			MatchCollection pageLinks = Regex.Matches (this.html, "offset=\\d+\">(\\d+)</a>");

			// Nothing to do unless more than one numbered pagination link exists.
			if (pageLinks.Count <= 1)
			{
				return;
			}

			// The last numbered link carries the total page count (group 1 is the only capture).
			int lastPage;
			if (!Int32.TryParse (pageLinks [pageLinks.Count - 1].Groups [1].Value, out lastPage))
			{
				return;
			}

			this.pageCount = lastPage;

			// The first page (offset 0) is already loaded, so continue with the second one.
			for (int page = 1; page < this.pageCount; page++)
			{
				this.html = this.FetchPage (page * PageOffsetStep);
				this.getCinemaData ();
			}
		}

		// Downloads the cinema list at the given offset using the current city cookie and
		// strips newlines, spaces, tabs and carriage returns so the regexes above can match.
		private string FetchPage (int offset)
		{
			string url = String.Format (CinemaListUrlFormat, offset);
			string page = HTTP.GET (url, this.ci);

			return page.Replace ("\n", "").Replace (" ", "").Replace ("\t", "").Replace ("\r", "");
		}
	}
}

Main函数

using System;

namespace maoyan
{
	/// <summary>
	/// Console entry point: crawls the cinema list of a single city and prints it.
	/// </summary>
	class MainClass
	{
		/// <summary>
		/// Prints a start banner, runs one crawl with the city cookie "ci=1", then prints an end banner.
		/// </summary>
		/// <param name="args">Command-line arguments (unused).</param>
		public static void Main (string[] args)
		{
			// City is selected through the "ci" cookie.
			const string cityCookie = "ci=1";

			Console.WriteLine ("------------开始------------");

			Cinema crawler = new Cinema ();
			crawler.Ready (cityCookie);
			crawler.Start ();

			Console.WriteLine ("------------结束------------");
		}
	}
}
  • 1
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值