今天在玩微博的时候,无意中想到用 Python 爬取一下,随便下载几张图片看看。但是我发现微博跟普通的网站不一样:
想获取一点内容,都得先从浏览器里拿到 cookie,否则请求无法自动跳转。目前我还没有找到更好的解决办法,等以后发现了
再更新,现在就先凑合着用。
#coding=utf-8
#微博
#下载几张图片
#
from lxml import etree
import requests
import re
import os
# Request headers for the crawler: a desktop Chrome User-Agent plus a Cookie.
# The Cookie value is a hard-coded session string; the note at the top of this
# file says it was taken from a browser because Weibo content could not be
# fetched without one.
# NOTE(review): a session cookie committed to source is effectively a
# credential and presumably goes stale — consider loading it from the
# environment or a local config file instead.
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
'Cookie' : 'UM_distinctid=15edca346a9aa5-0e9dbf8c2d480e-c303767-1fa400-15edca346aa7f1; SINAGLOBAL=6419057313447.329.1506940241879; SUB=_2AkMtGX-odcPxrAZXmvkQz2ngaYlH-jyezBZeAn7uJhMyAxgv7gcrqSdutBF-XKGzh9qGX4VXKOXpwXgdx6xYmTh9; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9W5Z6K5XQdMeqjmQKwInwfgd5JpVF02N1hzc1h-Eeo5f; UOR=www.baidu.com,vdisk.weibo.com,www.baidu.com; login_sid_t=2005b825ef14960898222c97937bfee8; cross_origin_proto=SSL; YF-Ugrow-G0=5b31332af1361e117ff29bb32e4d8439; YF-V5-G0=69afb7c26160eb8b724e8855d7b705c6; _s_tentry=www.baidu.com; Apache=2728703623251.8203.1516764166064; ULV=1516764166071:35:7:1:2728703623251.8203.1516764166064:1516351741102'
}
# Fetch the Weibo home page through a session, sending the prepared headers.
# A timeout is set so an unresponsive server cannot hang the script forever;
# 10 seconds matches the limit already used for the image downloads.
session = requests.Session()
r = session.get(url='https://weibo.com/', headers=headers, timeout=10)
print(r.status_code)  # HTTP status code of the response

# Parse the page (lxml is given UTF-8 bytes) and collect every <script>
# element; the image markup is searched for inside their text afterwards.
xpath_html = etree.HTML(r.text.encode('utf-8'))
src_list = xpath_html.xpath("//script")
# Collect the image URLs found inside the page's <script> elements.
#
# The <img> tags live inside script text (with backslash escapes), so they are
# pulled out with a regular expression rather than XPath. The pattern captures
# whatever follows `src=` up to the next space, quotes included.
_IMG_SRC_RE = re.compile(r'<img.*?src=(.*?) .*?>')

photo_album = []
for s in src_list:
    script_text = s.text
    if script_text is None:
        # <script src="..."> elements have no inline text to search.
        continue
    # Match on the text itself: encoding it to bytes first makes the str
    # pattern raise TypeError on Python 3, which used to be silently swallowed
    # and left the album empty.
    data = _IMG_SRC_RE.findall(script_text)
    print(len(data))
    for d in data:
        if d:
            # Drop the escaping backslashes so the URL is usable.
            url = d.replace('\\', "")
            photo_album.append(url)
            print(url)
# Download every collected image into ./download, naming the files
# 0.<ext>, 1.<ext>, ... in the order they are saved successfully.
store_dir = os.path.join(os.getcwd(), 'download')
i = 0
for e in photo_album:
    # The regex capture may still carry the quotes around the src value.
    e = e.replace('"', '')
    # Protocol-relative URLs (//host/path) have no scheme and requests rejects
    # them; the page itself was fetched over https, so use that scheme.
    if e.startswith('//'):
        e = 'https:' + e
    print(e)

    try:
        # Fetch the remote image; treat HTTP error statuses as failures so an
        # error page is never written to disk as if it were a picture.
        pic = requests.get(e, timeout=10)
        pic.raise_for_status()
    except requests.RequestException:
        print('图片无法下载')
        continue

    # Create the target directory lazily, only once something was downloaded.
    if not os.path.isdir(store_dir):
        os.mkdir(store_dir)

    # Take the extension from the URL path only, ignoring any query string or
    # fragment, so characters like '?' or '/' never end up in the file name.
    url_path = e.split('?', 1)[0].split('#', 1)[0]
    # NOTE(review): falling back to .jpg when the URL has no extension is an
    # assumption; adjust if other formats are expected.
    ext = os.path.splitext(url_path)[1] or '.jpg'
    string = os.path.join(store_dir, str(i) + ext)

    try:
        # The context manager closes the file even if the write fails.
        with open(string, 'wb') as fp:
            fp.write(pic.content)
    except (IOError, OSError) as err:
        print('cannot save %s: %s' % (string, err))
        continue
    i += 1
#微博异步加载的api
#更改它的参数获得更多的内容。
#不过需要注意的是,微博返回的数据是直接写在<script>标签内的。所以得先拿到script标签,再用正则表达式处理标签中的文本,不能用xpath等工具直接解析。
#https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr=6&category=0&page=2&lefnav=0&__rnd=1516764168254
# r = requests.get(url='https://weibo.com/a/aj/transform/loadingmoreunlogin?ajwvr=6&category=0&page=2&lefnav=0&__rnd=1516764168254/')
# print(r.status_code)
# print(r.text.encode('utf-8'))