A simple Weibo crawler

First, the initial version: send a request to a fixed URL and extract the corresponding data from the response.

Find the corresponding API link on the site, then pull the data out of it.
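Before writing the class, it helps to confirm what that link actually returns. A minimal probe, assuming the getIndex endpoint still responds with JSON whose posts sit under data.cards, each carrying an mblog object:

import requests

url = "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0"
headers = {
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}

resp = requests.get(url, headers=headers)
resp.raise_for_status()
data = resp.json()

# Inspect which fields each card exposes before deciding what to extract
first_card = data["data"]["cards"][0]
print(first_card.keys())            # expected: "scheme", "mblog", ...
print(first_card["mblog"].keys())   # counts, source, created_at, text, ...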

import requests
import re
from bs4 import BeautifulSoup


class WeiBo():
	def __init__(self):
		self.url = "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0"
		self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    }

	def get_json_data(self):
		"""
		Request the URL and return the response parsed as JSON.
		"""
		try:
			response = requests.get(self.url,headers=self.header)
			response.raise_for_status()
		except Exception:
			print("Error")
			exit(-1)
		else:
			return response.json()


	def analyse(self,json):
		"""
		Extract the needed fields from the downloaded JSON.
		"""
		cards = json.get("data").get("cards")
		card_data_list = []
		for card in cards:
			mblog = card.get("mblog")
			card_data ={
			"Link":card.get("scheme"),
			"点赞数":mblog.get("attitudes_count"),
			"评论数": mblog.get("comments_count"),
			"转发数": mblog.get("reposts_count"),
			"来源": mblog.get("source"),
			"内容": self.get_html_content(card.get("scheme")),
			"编辑时间":mblog.get("created_at"),
			}
			card_data_list.append(card_data)
		return card_data_list

		
	def get_html_content(self,url):
		"""
		Fetch the post's own page to get the full text of the Weibo post,
		because longer posts get truncated in the feed JSON.
		"""
		try:
			response = requests.get(url, headers=self.header)
			response.raise_for_status()
		except Exception as e:
			print(e)
		else:
			html = response.text
			# the full post is embedded in the page source as JSON between "status": and "hotScheme"
			pattern = re.compile(r'"status":(.*?)"hotScheme"', re.S)

			text_search = re.search(pattern, html).group(1).rstrip()[:-1]
			import json
			content = json.loads(text_search)
			text = content.get("text")
			soup = BeautifulSoup(text, "lxml")
			content = soup.get_text()
			return content

	def run(self):
		"""
		Tie everything together and run a simple crawl.
		"""
		json = self.get_json_data()
		# print(json)
		data = self.analyse(json)
		for i in data:
			for key, value in i.items():
				print(f"{key}:{value}")
			print("--"*20)

if __name__ == "__main__":
	test = WeiBo()
	test.run()

Edit 2019-10-17 22:34: I thought this version could now crawl pages in batches, but when actually run it kept returning the same content over and over.
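One thing worth checking when pages repeat is how since_id is produced: on m.weibo.cn it is usually not a plain page counter but a value carried in the previous response, often under data.cardlistInfo.since_id. A hedged sketch of chaining it, assuming that field is present for this container:

import requests

url_format = "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0&since_id={}"
headers = {
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
}

since_id = ""
for page in range(1, 6):
    data = requests.get(url_format.format(since_id), headers=headers).json()
    cards = data.get("data", {}).get("cards", [])
    print(f"page {page}: {len(cards)} cards")
    # take the next since_id from this response instead of incrementing a counter
    since_id = data.get("data", {}).get("cardlistInfo", {}).get("since_id", "")
    if not since_id:
        break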

import requests
import re
from bs4 import BeautifulSoup


class WeiBo():
	def __init__(self):
		self.url = "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0"
		self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    }

	def get_json_data(self,url):
		"""
		Request the URL and return the response parsed as JSON.
		"""
		try:
			response = requests.get(url,headers=self.header)
			response.raise_for_status()
		except Exception:
			print("Error")
			exit(-1)
		else:
			return response.json()


	def analyse(self,json):
		"""
		Extract the needed fields from the downloaded JSON.
		"""
		cards = json.get("data").get("cards")
		card_data_list = []
		for card in cards:
			mblog = card.get("mblog")
			card_data ={
			"Link":card.get("scheme"),
			"点赞数":mblog.get("attitudes_count"),
			"评论数": mblog.get("comments_count"),
			"转发数": mblog.get("reposts_count"),
			"来源": mblog.get("source"),
			"内容": self.get_html_content(card.get("scheme")),
			"编辑时间":mblog.get("created_at"),
			}
			card_data_list.append(card_data)
		return card_data_list

		
	def get_html_content(self,url):
		"""
		Fetch the post's own page to get the full text of the Weibo post,
		because longer posts get truncated in the feed JSON.
		"""
		try:
			response = requests.get(url, headers=self.header)
			response.raise_for_status()
		except Exception as e:
			print(e)
		else:
			html = response.text
			# the full post is embedded in the page source as JSON between "status": and "hotScheme"
			pattern = re.compile(r'"status":(.*?)"hotScheme"', re.S)

			text_search = re.search(pattern, html).group(1).rstrip()[:-1]
			import json
			content = json.loads(text_search)
			text = content.get("text")
			soup = BeautifulSoup(text, "lxml")
			content = soup.get_text()
			return content

	def run(self,url):
		"""
		Tie everything together and crawl one page of results.
		"""
		json = self.get_json_data(url)
		data = self.analyse(json)
		for i in data:
			for key, value in i.items():
				print(f"{key}:{value}")
			print("--"*20)

if __name__ == "__main__":
	test = WeiBo()
	i = 1
	url_format = "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0&since_id={}"
	
	while True:
		print("--------------正在爬取第{}页--------------".format(i))
		try:
			url = url_format.format(i)
			test.run(url)
		except Exception:
			url = url_format.format(i+1)
			test.run(url)
		finally:
			i+=1

If you open the URL above in a browser yourself, the content you get is actually different. After adding a Cookie to the request headers, the problem was solved.
Edited after running: 2019-10-17 23:20
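Since the cookie expires and has to be replaced now and then, it is convenient not to hard-code it in the source. A small sketch, assuming the cookie string is exported in an environment variable named WEIBO_COOKIE (a name chosen here purely for illustration):

import os

# WEIBO_COOKIE is a hypothetical variable name; export your own cookie string into it first
cookie = os.environ.get("WEIBO_COOKIE", "")

header = {
    "User-Agent": "Mozilla/5.0",
    "X-Requested-With": "XMLHttpRequest",
    "Cookie": cookie,
}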

import requests
import re
from bs4 import BeautifulSoup


class WeiBo():
	def __init__(self):
		self.url = "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0"
		self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "Cookie": "_T_WM=27523679371; MLOGIN=0; WEIBOCN_FROM=1110006030; XSRF-TOKEN=602d2a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D102803%26uicode%3D10000011%26fid%3D102803"
    }

	def get_json_data(self,url):
		"""
		Request the URL and return the response parsed as JSON.
		"""
		try:
			response = requests.get(url,headers=self.header)
			response.raise_for_status()
		except Exception:
			print("Error")
			exit(-1)
		else:
			return response.json()


	def analyse(self,json):
		"""
		Extract the needed fields from the downloaded JSON.
		"""
		cards = json.get("data").get("cards")
		card_data_list = []
		for card in cards:
			mblog = card.get("mblog")
			card_data ={
			"Link":card.get("scheme"),
			"点赞数":mblog.get("attitudes_count"),
			"评论数": mblog.get("comments_count"),
			"转发数": mblog.get("reposts_count"),
			"来源": mblog.get("source"),
			"内容": self.get_html_content(card.get("scheme")),
			"编辑时间":mblog.get("created_at"),
			}
			card_data_list.append(card_data)
		return card_data_list

		
	def get_html_content(self,url):
		"""
		Fetch the post's own page to get the full text of the Weibo post,
		because longer posts get truncated in the feed JSON.
		"""
		try:
			response = requests.get(url, headers=self.header)
			response.raise_for_status()
		except Exception:
			pass
		else:
			html = response.text
			# the full post is embedded in the page source as JSON between "status": and "hotScheme"
			pattern = re.compile(r'"status":(.*?)"hotScheme"', re.S)

			text_search = re.search(pattern, html).group(1).rstrip()[:-1]
			import json
			content = json.loads(text_search)
			text = content.get("text")
			soup = BeautifulSoup(text, "lxml")
			content = soup.get_text()
			return content

	def spider(self,url, file):
		"""
		Crawl one page of results and write them to a file.
		"""
		json = self.get_json_data(url)
		data = self.analyse(json)
		for i in data:
			for key, value in i.items():
				data = f"{key}:{value}"
				file.write(data+"\n")
				print(data)
			split = "--"*20
			print(split)
			file.write(split+"\n")

	def run(self):
		"""
		Crawl multiple pages in a loop.
		"""
		file = open("data.txt", "w",encoding='utf-8')
		i = 1
		url_format = "https://m.weibo.cn/api/container/getIndex?containerid=102803&openApp=0&since_id={}"
		while True:
			prompt = "-------------- Crawling page {} --------------".format(i)
			file.write(prompt+"\n")
			try:
				url = url_format.format(i)
				self.spider(url, file)
			except Exception:
				pass
			finally:
				i+=1
		file.close()

if __name__ == "__main__":
	test = WeiBo()
	test.run()

When actually running this, be sure to sleep between requests, otherwise you may get a 403.
Also, replace the Cookie with your own, and swap it out at intervals.
If possible, set up a cookie pool; otherwise the content becomes unretrievable after scraping for a while.
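A minimal sketch of both precautions, assuming you have collected a few cookies yourself (the strings below are placeholders):

import time
import random
import itertools
import requests

# placeholder cookies; substitute values copied from your own logged-in browser sessions
cookie_pool = itertools.cycle([
    "cookie-string-1",
    "cookie-string-2",
])

def polite_get(url, headers):
    """Rotate the Cookie header and pause briefly before each request."""
    headers = dict(headers)
    headers["Cookie"] = next(cookie_pool)
    time.sleep(random.uniform(1, 3))  # sleep to reduce the chance of a 403
    return requests.get(url, headers=headers)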
