目标:爬取微博内容并写入CSV
进度:基本达成目标,但效果有限,所以这版本定为第一版,后面继续完善
存在的疑问:
- 是没有用代理池,然后爬一个人很顺利的,然后到后面的人都爬取不了
- json转CSV只能通过字典,但字典的组织方式还需进一步改进(目前字段是手工逐个取出再写入的)
- 微博多个标签重复不知道怎么解决
还没解决的技术性问题:
- 触发了微博的反爬虫机制
- 试一下用微博接口读取
- 还有视频,转发文章,点开全文没有爬取到
- 标签下面还有很多其他标签,看看怎么写会简化一点
下一步目标:
微信文章的爬取
from pyquery import PyQuery as pq
import requests
import csv
import urllib.request
import re
import urllib.parse
import urllib
def getPage(page):
    """Fetch one page of a Weibo user's timeline from the m.weibo.cn Ajax API.

    Args:
        page: 1-based page number forwarded to the API.

    Returns:
        The decoded JSON dict on success, or None when the request or the
        JSON decode fails.
    """
    url = 'https://m.weibo.cn/api/container/getIndex?'  # endpoint discovered by inspecting the site's Ajax requests
    hd = {"User-Agent": 'Mozilla'}  # minimal UA header so the request looks like a browser
    params = {'type': 'uid',
              'value': '2492465520',
              'containerid': '1076032492465520',
              'page': page}  # query-string parameters appended to the URL
    try:
        r = requests.get(url, headers=hd, params=params)
        r.raise_for_status()
        return r.json()  # decode the JSON body
    except (requests.RequestException, ValueError):
        # Narrow except: network/HTTP errors and JSON decode errors only.
        # A bare `except:` would also swallow KeyboardInterrupt and real bugs.
        print('-----')
        return None
def parsePage(json):
    """Yield one dict per status ("mblog") found in a getIndex response.

    Args:
        json: decoded response dict from getPage, or None/falsy on failure.

    Yields:
        dict with keys: data (created_at), text (plain text, HTML stripped),
        source, video (isLongText flag), picture (bmiddle_pic), text22
        (retweeted original), idaa (status id), video22 (page_info).
    """
    if not json:
        return
    # 'data' or 'cards' can be absent in error payloads; default to an empty
    # list instead of raising TypeError while iterating None.
    items = (json.get('data') or {}).get('cards') or []
    for item in items:
        item = item.get('mblog')  # each card wraps the status in 'mblog'
        if item is None:
            continue  # non-status cards (e.g. ads/headers) carry no 'mblog'
        content = {}
        content['data'] = item.get('created_at')  # post date
        content['text'] = pq(item.get('text')).text()  # strip HTML tags via pyquery
        content['source'] = item.get('source')  # client the post was made with
        content['video'] = item.get('isLongText')  # True when the text is truncated
        content['picture'] = item.get('bmiddle_pic')  # photo URL
        content['text22'] = item.get('retweeted_status')  # retweeted original
        content['idaa'] = item.get('id')  # status id
        content['video22'] = item.get('page_info')  # video/page info
        yield content
def main():
    """Crawl pages 1-19 of the timeline and append one CSV row per status.

    Writes to kkk.csv (gb18030, matching the original output encoding);
    a header row is emitted only when the file is empty.
    """
    fieldnames = ['idbb', 'data', 'shorttext', 'isLongText', 'picture']
    # Open the CSV once instead of re-opening it for every single row.
    with open('kkk.csv', 'a', encoding='gb18030', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if csvfile.tell() == 0:
            writer.writeheader()  # DictWriter does not write a header by itself
        for i in range(1, 20):  # pages 1..19
            for result in parsePage(getPage(i)):
                print(result['video22'])  # video info: printed only, not stored yet
                writer.writerow({
                    'idbb': result['idaa'],
                    'data': result['data'],
                    'shorttext': str(result['text']),
                    'isLongText': result['video'],
                    'picture': result['picture'],
                })


if __name__ == '__main__':
    main()
from pyquery import PyQuery as pq
import requests
import csv
import urllib.request
import re
import urllib.parse
import urllib
def getPage(page):
    """Fetch one page of a Weibo user's timeline from the m.weibo.cn Ajax API.

    Args:
        page: 1-based page number forwarded to the API.

    Returns:
        The decoded JSON dict on success, or None when the request or the
        JSON decode fails.
    """
    url = 'https://m.weibo.cn/api/container/getIndex?'  # endpoint discovered by inspecting the site's Ajax requests
    hd = {"User-Agent": 'Mozilla'}  # minimal UA header so the request looks like a browser
    params = {'type': 'uid',
              'value': '2830678474',
              'containerid': '1076032830678474',
              'page': page}  # query-string parameters appended to the URL
    try:
        r = requests.get(url, headers=hd, params=params)
        r.raise_for_status()
        return r.json()  # decode the JSON body
    except (requests.RequestException, ValueError):
        # Narrow except: network/HTTP errors and JSON decode errors only.
        # A bare `except:` would also swallow KeyboardInterrupt and real bugs.
        print('-----')
        return None
def parsePage(json):
    """Yield one dict per status ("mblog") found in a getIndex response.

    Args:
        json: decoded response dict from getPage, or None/falsy on failure.

    Yields:
        dict with keys: data (created_at), text (plain text, HTML stripped),
        source, video (isLongText flag), picture (bmiddle_pic), text22
        (retweeted original), idaa (status id), video22 (page_info).
    """
    if not json:
        return
    # 'data' or 'cards' can be absent in error payloads; default to an empty
    # list instead of raising TypeError while iterating None.
    items = (json.get('data') or {}).get('cards') or []
    for item in items:
        item = item.get('mblog')  # each card wraps the status in 'mblog'
        if item is None:
            continue  # non-status cards (e.g. ads/headers) carry no 'mblog'
        content = {}
        content['data'] = item.get('created_at')  # post date
        content['text'] = pq(item.get('text')).text()  # strip HTML tags via pyquery
        content['source'] = item.get('source')  # client the post was made with
        content['video'] = item.get('isLongText')  # True when the text is truncated
        content['picture'] = item.get('bmiddle_pic')  # photo URL
        content['text22'] = item.get('retweeted_status')  # retweeted original
        content['idaa'] = item.get('id')  # status id
        content['video22'] = item.get('page_info')  # video/page info
        yield content
def main():
    """Crawl pages 1-2 of the timeline, expand truncated long posts via the
    statuses/extend API, and append one CSV row per status.

    Writes to kkk.csv (gb18030, matching the original output encoding);
    a header row is emitted only when the file is empty.
    """
    fieldnames = ['idbb', 'data', 'shorttext', 'isLongText', 'picture']
    # Open the CSV once instead of re-opening it for every single row.
    with open('kkk.csv', 'a', encoding='gb18030', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if csvfile.tell() == 0:
            writer.writeheader()  # DictWriter does not write a header by itself
        for i in range(1, 3):  # pages 1..2
            for result in parsePage(getPage(i)):
                if result['video']:  # isLongText: fetch the untruncated body
                    extend_url = 'https://m.weibo.cn/statuses/extend?id=' + result['idaa']
                    try:
                        resp = requests.get(extend_url, headers={"User-Agent": 'Mozilla'})
                        resp.raise_for_status()
                        # The original decoded resp.text with
                        # encode('gb18030').decode('unicode_escape'), which
                        # mangles the JSON and writes raw payload to the CSV.
                        # Parse the JSON instead.
                        # NOTE(review): 'data.longTextContent' is the field
                        # per the m.weibo.cn extend endpoint — confirm.
                        long_html = (resp.json().get('data') or {}).get('longTextContent')
                        shorttext = pq(long_html).text() if long_html else result['text']
                    except (requests.RequestException, ValueError):
                        shorttext = result['text']  # fall back to truncated text
                else:
                    shorttext = result['text']
                print(shorttext)
                writer.writerow({
                    'idbb': result['idaa'],
                    'data': result['data'],
                    'shorttext': str(shorttext),
                    'isLongText': result['video'],
                    'picture': result['picture'],
                })


if __name__ == '__main__':
    main()