# coding: utf-8
# Simulated login; scrape the fans commenting on a post on the redesigned
# Sina Weibo: nickname, avatar, user ID, and comment text. First time using
# pandas to write the output file. /20171218
# Weibo's anti-scraping measure: the comment list is loaded via Ajax, and a
# URL copied straight from the XHR panel carries "too-real" parameters such
# as root and a timestamp; requested as-is it will not return the full data.
# So after finding the request URL under XHR, strip the unneeded parameters
# from it first (see the strip_params sketch after the imports), or you walk
# right into the trap.
# Remember to sleep between requests, and make the sleep random, otherwise
# the account ban will be permanent! /20171218
from fake_useragent import UserAgent
import re
import requests
import pandas  # before picking up pandas, I wrote to the CSV row by row...
import time
import random
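def strip_params(url, keep):
    # Sketch for the header note above (the helper name and signature are my
    # own, not part of the original script): given a URL copied from the XHR
    # panel, keep only the query parameters named in `keep` and drop the
    # rest (root, timestamps, ...).
    from urllib.parse import urlsplit, urlunsplit, parse_qsl, urlencode
    parts = urlsplit(url)
    query = urlencode([(k, v) for k, v in parse_qsl(parts.query) if k in keep])
    return urlunsplit(parts._replace(query=query))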
def get_one_page(url):
    html = requests.get(url, headers=headers, cookies=cookies)
    # The endpoint answers with JSON, not plain HTML (html.text is just the
    # raw JSON string); the comment markup sits in its 'data' -> 'html' field.
    html_return = html.json()['data']['html']
    #print(html_return)
    return html_return
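def get_one_page_safe(url):
    # Defensive variant (a sketch, not wired into main below): if the cookie
    # has expired, Weibo tends to answer with a login page instead of JSON,
    # so .json() would raise. Fall back to an empty fragment in that case.
    resp = requests.get(url, headers=headers, cookies=cookies)
    try:
        return resp.json()['data']['html']
    except (ValueError, KeyError):
        print('unexpected response; the cookie may have expired')
        return ''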
def parse_one_page(html_return):
    # Capture groups: user ID, nickname, avatar URL, usercard attribute,
    # and the comment text.
    pattern = re.compile(r'com.(\d+)"><img alt="(.+?)" src="(.+?)" usercard="(.+?)"></a>.*?</a>:(.+?)</div>', re.S)
    data = re.findall(pattern, html_return)
    #print(data)
    return data
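def parse_with_bs4(html_return):
    # Alternative sketch (assumes beautifulsoup4 is installed; not called
    # anywhere below): a DOM parser is less brittle than this regex when
    # Weibo tweaks its markup. For brevity it only pulls the nicknames.
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_return, 'html.parser')
    return [img.get('alt') for img in soup.find_all('img', usercard=True)]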
def write_to_file(data):
    data_to_write = pandas.DataFrame(data)
    # Append without the header row or the index column.
    data_to_write.to_csv('test.csv', header=False, index=False, mode='a+')
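def write_with_header(data, path='test.csv'):
    # Sketch of a variant that keeps one header row (the column names are my
    # guess at the regex groups, not from the original): write the header
    # only on the first append, i.e. when the file does not exist yet.
    import os
    frame = pandas.DataFrame(
        data, columns=['uid', 'nickname', 'avatar', 'usercard', 'comment'])
    frame.to_csv(path, mode='a', header=not os.path.exists(path), index=False)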
def main(i):
    # The XHR request URL with the unnecessary parameters already stripped.
    url = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4185536291212647&page=' + str(i) + '&filter=hot&filter_tips_before=0&from=singleWeiBo'
    html_return = get_one_page(url)
    data = parse_one_page(html_return)
    write_to_file(data)
headers = {'User-Agent': UserAgent().random}
# 'balabala' is a placeholder: paste the cookie string copied from a
# logged-in browser session. requests' cookies= expects name -> value
# pairs, so the raw "k1=v1; k2=v2" string is split into a dict here.
raw_cookie = 'balabala'
cookies = dict(p.split('=', 1) for p in raw_cookie.split('; ') if '=' in p)
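def crawl_all_pages(max_pages=50):
    # Open-ended paging sketch (my assumption, not the original flow: it
    # treats an empty match list as "past the last page"); an alternative
    # to the fixed range(1, 10) loop below.
    for page in range(1, max_pages + 1):
        html_return = get_one_page(
            'https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4185536291212647'
            '&page=' + str(page) + '&filter=hot&filter_tips_before=0&from=singleWeiBo')
        data = parse_one_page(html_return)
        if not data:
            break
        write_to_file(data)
        time.sleep(random.uniform(2, 6))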
if __name__ == '__main__':
    for i in range(1, 10):
        main(i)
        # Random sleep between pages, so the request rhythm is less robotic.
        time.sleep(random.uniform(2, 6))