Web Scraping Course Notes
Bilibili crawler
# _*_ coding: utf-8 _*_
import os
import re
import requests
from lxml import etree


class BlBl:
    def __init__(self, url):
        self.url = url
        # Bilibili danmaku (bullet comment) URL template, keyed by cid
        self.danmu_url = 'https://comment.bilibili.com/{}.xml'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

    def get_html(self, url):
        """Send a request and return the decoded response body."""
        return requests.get(url, headers=self.headers).content.decode()

    def save_danmu(self, l, num):
        """Save one video's danmaku list to a text file."""
        print('saving')
        os.makedirs('./danmu', exist_ok=True)  # make sure the output directory exists
        with open('./danmu/{}.txt'.format(num), 'a', encoding='utf-8') as f:
            for danmu_str in l:
                print(danmu_str)
                f.write(danmu_str)
                f.write("\n")

    def get_xml(self, li):
        print(li)
        for num in li:  # iterate over the cid list and build each danmaku request URL
            # build the danmaku URL, send the request, and fetch the XML;
            # lxml refuses str input that carries an encoding declaration, so re-encode to bytes
            danmu_xml = self.get_html(self.danmu_url.format(num)).encode()
            # parse the XML into an etree object
            xml_etr_obj = etree.HTML(danmu_xml)
            # extract the danmaku text list
            l = xml_etr_obj.xpath('//d/text()')
            print('about to save')
            self.save_danmu(l, num)  # save

    def run(self):
        # request the video page
        bl_html = self.get_html(self.url)
        print('extracting cids with a regex')
        # extract the page URLs and cids of every part in this series
        li = re.findall(r"<option value='.*?' cid='(\d+)'>", bl_html)
        if len(li) == 0:  # a single-part video has no <option> list; grab its one cid instead
            li = re.findall(r"EmbedPlayer\('player',.*?cid=(\d+)&aid", bl_html)
        # request each danmaku XML and save the comments
        self.get_xml(li)


if __name__ == '__main__':
    url = 'https://www.bilibili.com/video/av18198653/'
    bili = BlBl(url)
    bili.run()
Crawling tips
Dynamic HTML techniques (overview)
Selenium and PhantomJS
Selenium
Selenium is a web automation testing tool, originally developed for automated website testing. It drives a real browser and supports all mainstream browsers (including headless ones such as PhantomJS): it can receive commands, make the browser load pages, extract the data you need, and even take screenshots.
PhantomJS
PhantomJS is a WebKit-based "headless" browser: it loads websites into memory and executes the JavaScript on the page.
http://selenium-python-zh.readthedocs.io/en/latest/waits.html
selenium demo
# coding=utf-8
from selenium import webdriver
import time

# instantiate a browser
driver = webdriver.Chrome()
# driver = webdriver.PhantomJS()
# set the window size
# driver.set_window_size(1920, 1080)
# maximize the window
driver.maximize_window()
# send the request
driver.get("http://www.baidu.com")
# take a screenshot of the page
driver.save_screenshot("./baidu.png")
# element locating methods
driver.find_element_by_id("kw").send_keys("python")
driver.find_element_by_id("su").click()
# get the HTML string from the driver
# print(driver.page_source)  # the content of the Elements panel in the browser
print(driver.current_url)
# get cookies from the driver
# cookies = driver.get_cookies()
# print(cookies)
# print("*"*100)
# cookies = {i["name"]:i["value"] for i in cookies}
# print(cookies)
# quit the browser
time.sleep(3)
driver.quit()
# coding=utf-8
from selenium import webdriver

driver = webdriver.Chrome()
# driver.get("http://neihanshequ.com/")
driver.get("https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&ch=&tn=baidu&bar=&wd=python&rn=&oq=&rsv_pq=87739988000939bf&rsv_t=b194dxdCny6hrJFXQrh4D6bavkKZwfpeT4s7j7V6AvGfiiAvTgxqGAvWbCM&rqlang=cn")
# ret1 = driver.find_elements_by_xpath("//ul[@id='detail-list']/li")
# print(ret1)
# for li in ret1:
#     print(li.find_element_by_xpath(".//h1/p").text)
#     print(li.find_element_by_xpath(".//a[@class='image share_url1']").get_attribute("href"))
# find_element_by_link_text: match the link's exact text
print(driver.find_element_by_link_text("下一页>").get_attribute("href"))
# find_element_by_partial_link_text: an <a> tag whose text contains "下一页"
print(driver.find_element_by_partial_link_text("下一页").get_attribute("href"))
driver.quit()
Email login
# coding=utf-8
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://mail.qq.com/")
# switch into the login iframe before locating elements inside it
driver.switch_to.frame("login_frame")
driver.find_element_by_id("u").send_keys("12312312312")
time.sleep(3)
driver.quit()
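After working inside an iframe you must switch back before locating anything in the main document again. A minimal sketch, continuing the QQ mail example above (the element IDs are the ones used there and may have changed since):

# coding=utf-8
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://mail.qq.com/")
driver.switch_to.frame("login_frame")  # enter the login iframe
driver.find_element_by_id("u").send_keys("12312312312")
driver.switch_to.default_content()  # switch back to the top-level document
driver.quit()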
Handling load delays
# coding=utf-8
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://www.bilibili.com/v/kichiku/mad/#/all/stow")
print(driver.find_element_by_xpath("//ul[@class='vd-list mod-2']/li//a[@class='title']").text)
# go to the next page
driver.find_element_by_xpath("//button[@class='nav-btn iconfont icon-arrowdown3']").click()
time.sleep(3)  # wait for the next page's data to load before reading it
print(driver.find_element_by_xpath("//ul[@class='vd-list mod-2']/li//a[@class='title']").text)
driver.quit()
Getting started
# Load a page:
from selenium import webdriver
driver = webdriver.PhantomJS("c:…/phantomjs.exe")
driver.get("http://www.baidu.com/")
driver.save_screenshot("长城.png")
# Locate and interact:
driver.find_element_by_id("kw").send_keys("长城")
driver.find_element_by_id("su").click()
# Inspect request info:
driver.page_source
driver.get_cookies()
driver.current_url
# Exit
driver.close()  # close the current window
driver.quit()   # quit the browser
Usage:
find_element_by_id (returns a single element)
find_elements_by_xpath (returns a list)
find_elements_by_link_text
find_elements_by_partial_link_text
find_elements_by_tag_name
find_elements_by_class_name
find_elements_by_css_selector
Notes:
find_element vs. find_elements: the former returns one element, the latter returns a list
by_link_text vs. by_partial_link_text: exact text match vs. text containing the given string
by_css_selector usage: #food span.dairy.aged
with by_xpath, read attributes with get_attribute() and text with .text (see the example below)
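A quick illustration of that last point, assuming the Baidu homepage used in the demos above still has an anchor named tj_trnews (an assumption; substitute any element):

# coding=utf-8
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("http://www.baidu.com")
link = driver.find_element_by_xpath("//a[@name='tj_trnews']")  # assumed element; pick any link
print(link.text)                   # the element's visible text
print(link.get_attribute("href"))  # the value of its href attribute
driver.quit()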
Cookie-related usage:
{cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
driver.delete_cookie("CookieName")
driver.delete_all_cookies()
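The dict comprehension above is how a selenium login gets handed over to requests. A minimal sketch, assuming you have already logged in through the driver (the profile URL is a placeholder):

# coding=utf-8
import requests
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.douban.com/")
# ... log in through the browser here ...
cookies = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
driver.quit()

# reuse the logged-in state with requests
response = requests.get("https://www.douban.com/mine/", cookies=cookies)  # placeholder URL
print(response.status_code)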
Page waits
Why wait at all?
If a site uses dynamic HTML, you cannot know in advance when a given element will appear on the page. Set a wait so the element is required to show up within a time limit; otherwise an error is raised.
Forced wait
time.sleep(10)
Explicit wait (overview)
An explicit wait specifies a condition plus a maximum wait time. If the element is not found within that time, an exception is thrown.
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "myDynamicElement")))
Implicit wait (overview)
Simply sets a maximum wait time, in seconds, that applies to every element lookup.
driver.implicitly_wait(10)
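A runnable version of the explicit wait above, with the imports it needs (the URL and element ID are the placeholders from the selenium docs):

# coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://somedomain/url_that_delays_loading")
try:
    # wait at most 10 seconds for the element to appear in the DOM
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
    print(element.text)
finally:
    driver.quit()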
chromedriver download:
https://npm.taobao.org/mirrors/chromedriver
phantomjs download:
http://phantomjs.org/download.html
Simulated Douban login
# coding=utf-8
from selenium import webdriver
import time
import requests
from yundama.dama import indetify

# instantiate the driver
driver = webdriver.Chrome()
driver.get("https://www.douban.com/")
driver.find_element_by_id("form_email").send_keys("784542623@qq.com")
driver.find_element_by_id("form_password").send_keys("zhoudawei123")
# recognize the captcha: grab its URL from the page, download it with requests
captcha_image_url = driver.find_element_by_id("captcha_image").get_attribute("src")
captcha_content = requests.get(captcha_image_url).content
captcha_code = indetify(captcha_content)
print("captcha recognized as:", captcha_code)
# type in the captcha and submit
driver.find_element_by_id("captcha_field").send_keys(captcha_code)
driver.find_element_by_class_name("bn-submit").click()
# collect the cookies
cookies = {i["name"]:i["value"] for i in driver.get_cookies()}
print(cookies)
time.sleep(3)
driver.quit()
Yundama
A third-party captcha recognition platform: register, pay for credits, and configure it.
Yundama has shut down; try Chaojiying instead:
http://www.chaojiying.com/api-14.html
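The yundama.dama.indetify helper imported in the Douban script above is not shown in these notes; its interface is just image bytes in, recognized text out. A hypothetical stand-in so the script runs end to end (swap the body for a real call to whichever platform you use):

# yundama/dama.py -- hypothetical stub, not the real Yundama client
def indetify(image_bytes):
    """Take the raw bytes of a captcha image and return the recognized text."""
    # fall back to a human: save the image and ask for the answer
    with open("captcha_tmp.jpg", "wb") as f:
        f.write(image_bytes)
    return input("Type the captcha shown in captcha_tmp.jpg: ")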
Douyu crawler
Crawl the information of every live room on the Douyu streaming platform
https://www.douyu.com/directory/all
# coding=utf-8
from selenium import webdriver
import time


class DouyuSpider:
    def __init__(self):
        self.start_url = "https://www.douyu.com/directory/all"
        self.driver = webdriver.Chrome()

    def get_content_list(self):
        li_list = self.driver.find_elements_by_xpath("//ul[@id='live-list-contentbox']/li")
        content_list = []
        for li in li_list:
            item = {}
            item["room_img"] = li.find_element_by_xpath(".//span[@class='imgbox']/img").get_attribute("src")
            item["room_title"] = li.find_element_by_xpath("./a").get_attribute("title")
            item["room_cate"] = li.find_element_by_xpath(".//span[@class='tag ellipsis']").text
            item["anchor_name"] = li.find_element_by_xpath(".//span[@class='dy-name ellipsis fl']").text
            item["watch_num"] = li.find_element_by_xpath(".//span[@class='dy-num fr']").text
            print(item)
            content_list.append(item)
        # locate the next-page element; find_elements returns [] on the last page
        next_url = self.driver.find_elements_by_xpath("//a[@class='shark-pager-next']")
        next_url = next_url[0] if len(next_url) > 0 else None
        return content_list, next_url

    def save_content_list(self, content_list):
        pass  # saving is left as an exercise

    def run(self):  # main logic
        # 1. start_url
        # 2. send the request, get the response
        self.driver.get(self.start_url)
        # 3. extract the data and the next-page element
        content_list, next_url = self.get_content_list()
        # 4. save the data
        self.save_content_list(content_list)
        # 5. click the next-page element and loop
        while next_url is not None:
            next_url.click()
            time.sleep(3)  # wait for the next page to load
            content_list, next_url = self.get_content_list()
            self.save_content_list(content_list)


if __name__ == '__main__':
    douyu = DouyuSpider()
    douyu.run()
Tesseract
Definition:
Tesseract is an OCR (Optical Character Recognition) library that turns images into text.
Install:
sudo apt-get install tesseract-ocr
To call Tesseract from Python:
pip install pytesseract
In a terminal:
tesseract test.jpg text
In Python code:
import pytesseract
from PIL import Image
image = Image.open("test.jpg")
print(pytesseract.image_to_string(image))
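Putting the pieces together: download an image with requests and OCR it in memory, assuming tesseract-ocr and pytesseract are installed (the URL is a placeholder):

# coding=utf-8
import requests
import pytesseract
from io import BytesIO
from PIL import Image

response = requests.get("http://example.com/captcha.jpg")  # placeholder URL
image = Image.open(BytesIO(response.content))  # open the image straight from memory
print(pytesseract.image_to_string(image))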
Zhihu crawler
# coding=utf-8
import requests
from lxml import etree
from dama import indetify
import time


class ZhihuSpider:
    def __init__(self):
        self.session = requests.Session()
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}

    def login(self):
        # fetch the sign-in page and pull the _xsrf token out of the form
        html = self.session.get('https://www.zhihu.com/#signin', headers=self.headers).content
        html = etree.HTML(html)
        _xsrf = html.xpath("//input[@name='_xsrf']/@value")[0]
        # request the captcha with the SAME session so it matches our server-side state
        captcha_url = 'https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000)
        captcha_response = self.session.get(captcha_url, headers=self.headers)
        with open("temp.gif", "wb") as f:
            f.write(captcha_response.content)
        captcha = indetify(captcha_response.content)
        post_data = {
            "_xsrf": _xsrf,
            "email": "***",
            "password": "***",
            "remember_me": True,
            "captcha": captcha
        }
        print(post_data)
        resp = self.session.post('https://www.zhihu.com/login/email', data=post_data, headers=self.headers)
        print(resp.status_code)
        print(resp.content.decode())
        # verify the login by fetching the home page with the same session
        response = self.session.get("https://www.zhihu.com/", headers=self.headers)
        with open("login.html", "w", encoding="utf-8") as f:
            f.write(response.content.decode())


if __name__ == '__main__':
    zhihu = ZhihuSpider()
    zhihu.login()
Key points
#### Captcha recognition
- URL stays the same, captcha stays the same
  - request the captcha URL, get the response, recognize it
- URL stays the same, captcha changes
  - Idea: when the server hands out a captcha, it records which captcha belongs to which user; when the user later sends the POST request, the server compares the captcha sent in that POST against the one stored server-side for that user. So the captcha request must come from the same session as the login POST (see the sketch after this list):
  - 1. instantiate a session
  - 2. use the session to request the login page and get the captcha URL
  - 3. use the session to request the captcha, recognize it
  - 4. use the session to send the POST request
- Logging in with selenium and hitting a captcha
  - URL stays the same, captcha stays the same: same as above
  - URL stays the same, captcha changes:
  - 1. request the login page with selenium and grab the captcha URL
  - 2. take the cookies out of the driver, hand them to requests to fetch the captcha, recognize it
  - 3. type the captcha in and click login
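A minimal sketch of the session-based flow in steps 1-4 above; the URLs, form fields, and XPath are placeholders, and recognize() is a manual stand-in for a real recognition API:

# coding=utf-8
import requests
from lxml import etree


def recognize(image_bytes):
    # placeholder recognizer: save the captcha and ask a human
    with open("captcha_tmp.jpg", "wb") as f:
        f.write(image_bytes)
    return input("captcha: ")


session = requests.Session()  # 1. instantiate a session
html = session.get("http://example.com/login").content  # 2. request the login page
captcha_url = etree.HTML(html).xpath("//img[@id='captcha']/@src")[0]  # placeholder xpath
captcha_bytes = session.get(captcha_url).content  # 3. request the captcha with the SAME session
post_data = {
    "username": "***",
    "password": "***",
    "captcha": recognize(captcha_bytes),
}
resp = session.post("http://example.com/login", data=post_data)  # 4. POST with the same session
print(resp.status_code)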
### Notes on using selenium
- Getting text and getting attributes
  - locate the element first, then call `.text` or `get_attribute()` on it
  - the page data selenium sees is the content of the browser's Elements panel
- find_element vs. find_elements
  - find_element returns one element and raises an error if there is none
  - find_elements returns a list, empty if nothing matched
  - when checking whether a next page exists, use find_elements and test the length of the result
- If the page contains an iframe/frame, call driver.switch_to.frame first to switch into it before locating elements inside
- When selenium requests the first page it waits for the page to finish loading before returning data, but after clicking to the next page it grabs data immediately; the data may not be loaded yet and the lookup can fail, so add time.sleep(3)
- find_element_by_class_name accepts only a single class value, not several classes at once (see the CSS selector sketch below)
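Since find_element_by_class_name takes only one class, chain the classes in a CSS selector instead; e.g. the Douyu spider's 'tag ellipsis' span could also be located like this:

# coding=utf-8
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.douyu.com/directory/all")
# both classes go into one CSS selector; class_name would reject "tag ellipsis"
span = driver.find_element_by_css_selector("span.tag.ellipsis")
print(span.text)
driver.quit()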
db.stu.aggregate({$group:{_id:"$name",counter:{$sum:2}}})
db.stu.aggregate({$group:{_id:null,counter:{$sum:1}}})
db.stu.aggregate({$group:{_id:"$gender",name:{$push:"$name"}}})
db.stu.aggregate({$group:{_id:"$gender",name:{$push:"$$ROOT"}}})
db.tv3.aggregate(
{$group:{_id:{"country":"$country",province:"$province",userid:"$userid"}}},
{$group:{_id:{country:"$_id.country",province:"$_id.province"},count:{$sum:1}}},
{$project:{country:"$_id.country",province:"$_id.province",count:"$count",_id:0}}
)
db.stu.aggregate(
{$match:{age:{$gt:20}}},
{$group:{_id:"$gender",count:{$sum:1}}}
)
db.t2.aggregate(
{$unwind:"$size"}
)
db.t3.aggregate(
{$unwind:"$tags"},
{$group:{_id:null,count:{$sum:1}}}
)
db.t3.aggregate(
{$unwind:{path:"$size",preserveNullAndEmptyArrays:true}}
)