The concept of a crawler
A crawler is a program that simulates a browser: it sends network requests and collects the responses.
The crawler workflow
url -> send the request, get the response -> extract the data -> save
get the response -> extract further url addresses, keep requesting
How a browser requests a page
The data the browser ends up with includes the response for the url itself plus js, css, and images
A crawler only fetches the response for the url itself
What the crawler receives therefore differs from what the Elements panel shows; when extracting data, always work from the raw response of the url address
Key HTTP request headers
- User-Agent: tells the server what kind of client is requesting the resource; faking it is one of the most important ways a crawler imitates a browser
- Cookie: needed to fetch resources that are only accessible after logging in
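As a quick illustration, a headers dict for requests might look like the following sketch; the cookie value is a placeholder, not a real credential:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    "Cookie": "sessionid=xxxx",  # placeholder; only needed for pages behind a login
}
response = requests.get("http://www.baidu.com", headers=headers)
print(response.status_code)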
Strings in Python 2 and Python 3
ascii: one byte per character
unicode: two bytes per character
utf-8: a variable-length encoding; one, two, or three bytes per character
Python 2
- str: the byte type; convert to unicode with decode()
- unicode: the text type; convert to the str byte type with encode()
Python 3
- str: unicode text; convert to bytes with encode()
- bytes: the byte type; convert to str with decode()
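A minimal Python 3 round trip between str and bytes:
s = "长城"                # str: unicode text
b = s.encode("utf-8")     # bytes
print(type(b), b)
s2 = b.decode("utf-8")    # back to str
assert s == s2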
Sending a request with headers
- Why attach headers to a request?
  To imitate a browser and fool the server into returning the same content it would give a browser
- Form of the headers: a dict
requests.get(url, headers=headers)
Sending a request with parameters
- Form of the parameters: a dict
kw = {'wd': '长城'}
- Usage
requests.get(url, params=kw)
- A note about parameters
Many parameters in a url are useless; in a Baidu search url, for example, only one field matters and all the others can be deleted
Accordingly, whenever a later crawler meets a url carrying many parameters, try deleting them
# coding=utf-8
import requests
query_string = input(":")
params = {"wd": query_string}
# url = "https://www.baidu.com/s?"
# url = "https://www.baidu.com/s?wd=%s" % query_string
url = "https://www.baidu.com/s?wd={}".format(query_string)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
# response = requests.get(url, params=params, headers=headers)
response = requests.get(url, headers=headers)
print(response.status_code)
print(response.request.url)
Tieba example
import requests
class TiebaSpider:
def __init__(self, tieba_name):
        self.tieba_name = tieba_name
self.urltemp = "http://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
def get_url_list(self):
return [self.urltemp.format(i * 50) for i in range(5)]
    def parse_url(self, url):  # send the request, get the response
print(url)
response = requests.get(url, headers=self.headers)
return response.content.decode()
    def save_html_str(self, html_str, page_num):
        file_path = "{}_page_{}.html".format(self.tieba_name, page_num)
with open(file_path, "w", encoding='utf-8') as f:
f.write(html_str)
print("保存成功")
    def run(self):  # main logic
        # 1. build the url list
        url_list = self.get_url_list()
        # 2. send requests, get responses
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            # 3. save
            self.save_html_str(html_str, page_num)
if __name__ == '__main__':
tieba_spider = TiebaSpider('李毅')
tieba_spider.run()
Sending a POST request
data = {...}  # fill in the key-value pairs found under Form Data in the browser's dev tools
requests.post(url, data=data)
Baidu Translate example
import requests
import json
import sys
class Fanyi:
def __init__(self, query_string):
self.query_string = query_string
self.url = "http://fanyi.baidu.com/basetrans"
self.headers = {
'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'}
    def get_post_data(self):  # build the post_data
post_data = {"query": self.query_string,
"form": "zh",
"to": "en"}
return post_data
def parse_url(self, url, data):
response = requests.post(url, data=data, headers=self.headers)
        return response.content.decode()  # send the request, get the response
    def get_ret(self, json_str):  # extract the data
temp_dict = json.loads(json_str)
ret = temp_dict["trans"][0]["dst"]
print("{}:{}".format(self.query_string, ret))
    def run(self):  # main logic
        # 1. url and post data
        post_data = self.get_post_data()
        # 2. send the request, get the response
        json_str = self.parse_url(self.url, post_data)
        # 3. extract the data
        self.get_ret(json_str)
self.get_ret(json_str)
if __name__ == '__main__':
query_string = sys.argv[1]
fanyi = Fanyi(query_string)
fanyi.run()
Using proxies
Why use a proxy?
- so the server does not realize the requests come from the same client
- to keep our real address from being exposed and traced
Understanding how a proxy works
browser -> request -> proxy -> web server -> response -> proxy -> browser
Using a proxy
proxies = {"scheme": "scheme://ip:port"}
requests.get(url, proxies=proxies)
import requests
url = "http://www.sina.com.cn/"
proxies = {"http": "http://106.12.202.202:6068"}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
resp = requests.get(url, proxies=proxies, headers=headers)
print(resp.status_code)
The ternary expression
if the condition holds, the value before if is assigned to to; otherwise the value after else is assigned
to = "en" if lan == "zh" else "zh"
Logging in with cookies via requests
- put the cookie string into headers, or
- pass a cookie dict to the cookies argument of the request method (see the sketch below)
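A minimal sketch of both approaches; the url and all cookie values are placeholders:
import requests

url = "https://example.com/user/profile"  # placeholder url

# option 1: the raw cookie string inside headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    "Cookie": "sessionid=xxxx; token=yyyy",  # copied from the browser
}
resp1 = requests.get(url, headers=headers)

# option 2: a cookie dict passed to the cookies argument
cookies = {"sessionid": "xxxx", "token": "yyyy"}
resp2 = requests.get(url, cookies=cookies)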
How to find the login endpoint
- the url from the form's action attribute
  - post data: a dict whose keys are the name values of the username/password input tags and whose values are the username and password
- or locate the url address by capturing packets
  - and take the post data from the form data shown there (a combined sketch follows below)
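Putting this together, a minimal requests.Session login sketch; the url and the field names "username"/"password" are invented for illustration:
import requests

session = requests.Session()  # keeps cookies across requests

login_url = "https://example.com/login"  # hypothetical form action url
post_data = {"username": "tom", "password": "123456"}  # keys from the input tags' name attributes
session.post(login_url, data=post_data)

# the session now carries the login cookies, so later requests are authenticated
resp = session.get("https://example.com/user/profile")
print(resp.status_code)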
Analyzing js to obtain encrypted data
- observe what changes
- locate the js
  - via the Event Listeners panel
  - by searching for keywords from the url with Chrome's search-all-files
- analyze it
  - step through the js with breakpoints
- execute the js
  - or fully re-implement the js logic in python (see the sketch below)
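For instance, suppose the analyzed js computed a signature sign = md5(query + salt); the same logic could be re-implemented in Python like this (the formula and the salt are invented for illustration):
import hashlib

def make_sign(query_string, salt="1234567890"):  # hypothetical salt
    # re-implementation of an (invented) js function: sign = md5(query + salt)
    raw = (query_string + salt).encode("utf-8")
    return hashlib.md5(raw).hexdigest()

print(make_sign("长城"))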
Handling SSL certificate errors with requests
requests.get(url, verify=False)
import requests
url = "https://www.12306.cn/mormhweb/"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
resp = requests.get(url, headers=headers, verify=False)
print(resp.status_code)
Getting the cookies from a response and converting them to a dict
response = requests.get(url, headers=headers)
requests.utils.dict_from_cookiejar(response.cookies)
Using the timeout parameter
requests.get(url, timeout=3)
Using the retrying module
- from retrying import retry
- use retry as a decorator: it catches exceptions and re-runs the decorated function
import requests
from retrying import retry
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
@retry(stop_max_attempt_number=3)
def _parse_url(url):
print("*" * 100)
response = requests.get(url, headers=headers, timeout=3)
assert response.status_code == 200
return response.content.decode()
def parse_url(url):
try:
html_str = _parse_url(url)
except Exception as e:
print(e)
html_str = None
return html_str
if __name__ == '__main__':
    # url = "https://www.baidu.com"
    url = "www.baidu.com"  # no scheme, so the request raises and the retry logic kicks in
    print(parse_url(url))
Regular expressions
- regex syntax
  - '.' matches any character except \n; with the re.S flag it matches \n as well
  - '\' escapes a special character
  - '[]' a character set: matches any one of the characters inside
  - '|' or: matches the content on either side of '|'
  - '*' matches 0 or more times
  - '+' matches 1 or more times
  - '?' matches 0 or 1 times
  - '\s' whitespace, including space and \n
  - '\d' a digit
Common methods of the re module
- re.findall("regex", "str")  # returns a list
- re.sub("regex", "_", "str")  # returns a string with the matches replaced
- re.compile("regex", re.S)  # compiles the pattern to speed up repeated matching
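A quick runnable illustration of all three methods on a made-up html fragment:
import re

html = "<p>price: 12 yuan</p>\n<p>stock: 3</p>"

print(re.findall(r"\d+", html))              # ['12', '3'] -- returns a list
print(re.sub(r"\d+", "_", html))             # digits replaced, returns a string
pattern = re.compile(r"<p>(.*?)</p>", re.S)  # compile once, reuse many times
print(pattern.findall(html))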
Guokr example
import requests
import re
class Guoke:
def __init__(self):
self.url_temp = "https://www.guokr.com/ask/highlight/?page={}"
        self.headers = {"Referer": "https://m.douban.com/tv/american",
                        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
                        }
def get_url_list(self):
return [self.url_temp.format(i) for i in range(1, 101)]
def parse_url(self, url):
print(url)
resp = requests.get(url, headers=self.headers)
html_str = resp.content.decode()
return html_str
def get_content_list(self, html_str):
content_list = re.findall(r"<h2><a target=\"_blank\" href=\"(.*?)\">(.*?)</a></h2>", html_str, re.S)
return content_list
    def save_content_list(self, content_list):  # save the data
for content in content_list:
print(content)
    def run(self):
        # 1. build the url list
        url_list = self.get_url_list()
        # 2. iterate: send requests, get responses
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save
            self.save_content_list(content_list)
if __name__ == '__main__':
guoke = Guoke()
guoke.run()
Raw strings (the r prefix)
- makes special symbols mean their literal selves
- uses
  - regex: avoids the influence of escape characters; with r you can write the pattern exactly as you see it
  - windows file paths
Types of data
- structured data
  - json, xml
  - convert directly to python types with the corresponding module
- unstructured data
  - html
  - extract with re or xpath
Using the json module
json.loads: json string -> python type
json.dumps: python type -> json string
json.load: read a file-like object containing json into a python object
json.dump: write a python type into a file-like object
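A small round trip through all four functions:
import json

data = {"name": "长城", "length_km": 21196}

json_str = json.dumps(data, ensure_ascii=False)  # python -> json string
print(json.loads(json_str))                      # json string -> python

with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False)       # python -> file

with open("data.json", encoding="utf-8") as f:
    print(json.load(f))                          # file -> python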
XPath syntax
- the use of //
  //a : all a elements anywhere in the html page
  bookstore//book : all book elements under bookstore
- the use of @
  //a/@href : the href attribute of every a element
  //title[@lang="eng"] : the title elements whose lang attribute is "eng"
- the use of text()
  //a/text() : the text under every a element
  //a[text()='下一页'] : the a elements whose text is 下一页
- selecting specific nodes
  //a[1] : the first a
  //a[last()] : the last a
  //a[position()<4] : the first three
- using the lxml module
  from lxml import etree
  element = etree.HTML(html_str)  # accepts a bytes or str string
  element.xpath("xpath str")  # returns a list
  etree.tostring(element)  # serialize back to a string
  # when extracting data: group the nodes first, then extract within each group
- contains() in xpath
  //a[contains(text(), "下一页")] : the a elements whose text contains 下一页
  //a[contains(@class, 'n')] : the a elements whose class contains n
- decoding url addresses
  requests.utils.unquote()
import requests
from lxml import etree
class Qiubai:
def __init__(self):
self.temp_url = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
def get_url_list(self):
return [self.temp_url.format(i) for i in range(1, 14)]
def parse_url(self, url):
response = requests.get(url, headers=self.headers)
return response.content
    def get_content_list(self, html_str):  # extract the data
html = etree.HTML(html_str)
li_list = html.xpath("//div[@class='recommend-article']//li")
content_list = []
for li in li_list:
item = {}
item["user_name"] = li.xpath(".//div[@class='recmd-right']/a/text()")[0]
content_list.append(item)
return content_list
def save_content_list(self, content_list):
for content in content_list:
print(content)
    def run(self):  # main logic
        # 1. prepare the url addresses
        url_list = self.get_url_list()
        # 2. iterate: send requests, get responses
        for url in url_list:
            html_str = self.parse_url(url)
            # 3. extract the data
            content_list = self.get_content_list(html_str)
            # 4. save
            self.save_content_list(content_list)
self.save_content_list(content_list)
if __name__ == '__main__':
qiubai = Qiubai()
qiubai.run()
Preparing url addresses
- when the url pattern and the total number of pages are known, build the url list up front (Guokr, Qiushibaike)
- when the url pattern or the page count is unknown, prepare a start_url and follow links (Tieba)
Multithreaded crawlers
- t1 = threading.Thread(target=func, args=(arg1,))  # args must be a tuple
- t1.setDaemon(True)
- t1.start()  # the thread only starts running here
Using the queue module
from queue import Queue
q = Queue(maxsize=100)
item = {}
q.put_nowait(item)  # put without waiting; raises if the queue is full
q.put(item)  # put an item; blocks while the queue is full
q.get_nowait()  # get without waiting; raises if the queue is empty
q.get()  # get an item; blocks while the queue is empty
q.qsize()  # the number of items currently in the queue
q.join()  # block the main thread until the queue's task count drops to 0
q.task_done()
# put() increments the task count; get() alone does not decrement it;
# each get() must be paired with task_done() to decrement the count
- q.task_done() paired with get() decrements the queue count by 1
- q.put() increments the queue count by 1
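A tiny demonstration of the counting behavior; join() returns only after every put() has been matched by a get() plus task_done():
from queue import Queue
import threading

q = Queue()
for i in range(3):
    q.put(i)  # task count is now 3

def worker():
    while True:
        item = q.get()  # does not decrement the count by itself
        print("processed", item)
        q.task_done()   # now the count drops by 1

t = threading.Thread(target=worker, daemon=True)
t.start()
q.join()  # returns once the count reaches 0
print("all tasks done")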
- multithreaded Qiushibaike example
import requests
from lxml import etree
from queue import Queue
import threading
import time
class Qiubai:
def __init__(self):
self.temp_url = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
self.url_queue = Queue()
self.html_queue = Queue()
self.content_list_queue = Queue()
def get_url_list(self):
# return [self.temp_url.format(i) for i in range(1, 14)]
for i in range(1, 14):
self.url_queue.put(self.temp_url.format(i))
def parse_url(self):
while True:
url = self.url_queue.get()
response = requests.get(url, headers=self.headers)
if response.status_code != 200:
self.url_queue.put(url)
# return response.content
else:
self.html_queue.put(response.content)
                self.url_queue.task_done()  # decrement the queue count by 1
    def get_content_list(self):  # extract the data
while True:
html_str = self.html_queue.get()
html = etree.HTML(html_str)
li_list = html.xpath("//div[@class='recommend-article']//li")
content_list = []
for li in li_list:
item = dict()
item["user_name"] = li.xpath(".//div[@class='recmd-right']/a/text()")
content_list.append(item)
# return content_list
self.content_list_queue.put(content_list)
self.html_queue.task_done()
def save_content_list(self):
while True:
content_list = self.content_list_queue.get()
for content in content_list:
print(content)
# pass
self.content_list_queue.task_done()
    def run(self):  # main logic
        thread_list = []
        # 1. prepare the url addresses
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. iterate: send requests, get responses
        for i in range(3):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        # 3. extract the data
        t_content = threading.Thread(target=self.get_content_list)
        thread_list.append(t_content)
        # 4. save
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.setDaemon(True)  # make the worker threads daemon threads
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_list_queue]:
            q.join()  # block the main thread until every queue's count reaches 0
if __name__ == '__main__':
t1 = time.time()
qiubai = Qiubai()
qiubai.run()
print("total cost:", time.time() - t1)
Multiprocess crawlers
from multiprocessing import Process
p1 = Process(target=func, args=(arg1,))
p1.daemon = True  # make it a daemon process
p1.start()  # the process only starts running here
- multiprocessing
  - p = multiprocessing.Process(target=func, args=(arg1,))
  - p.daemon = True  # make it a daemon process
  - p.start()
- from multiprocessing import JoinableQueue
  - q = JoinableQueue()
  - q.join()  # block the main process until the queue tasks finish
  - q.put()  # count +1
  - q.get()  # does not decrement the count by itself
  - q.task_done()  # get() paired with task_done() decrements the count
import requests
from lxml import etree
# from queue import Queue
# import threading
import time
from multiprocessing import Process
from multiprocessing import JoinableQueue as Queue
class Qiubai:
def __init__(self):
self.temp_url = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
self.url_queue = Queue()
self.html_queue = Queue()
self.content_list_queue = Queue()
def get_url_list(self):
# return [self.temp_url.format(i) for i in range(1, 14)]
for i in range(1, 14):
self.url_queue.put(self.temp_url.format(i))
def parse_url(self):
while True:
url = self.url_queue.get()
response = requests.get(url, headers=self.headers)
if response.status_code != 200:
self.url_queue.put(url)
# return response.content
else:
self.html_queue.put(response.content)
                self.url_queue.task_done()  # decrement the queue count by 1
    def get_content_list(self):  # extract the data
while True:
html_str = self.html_queue.get()
html = etree.HTML(html_str)
li_list = html.xpath("//div[@class='recommend-article']//li")
content_list = []
for li in li_list:
item = dict()
item["user_name"] = li.xpath(".//div[@class='recmd-right']/a/text()")
content_list.append(item)
# return content_list
self.content_list_queue.put(content_list)
self.html_queue.task_done()
def save_content_list(self):
while True:
content_list = self.content_list_queue.get()
for content in content_list:
print(content)
# pass
self.content_list_queue.task_done()
    def run(self):  # main logic
        process_list = []
        # 1. prepare the url addresses
        t_url = Process(target=self.get_url_list)
        process_list.append(t_url)
        # 2. iterate: send requests, get responses
        for i in range(3):
            t_parse = Process(target=self.parse_url)
            process_list.append(t_parse)
        # 3. extract the data
        t_content = Process(target=self.get_content_list)
        process_list.append(t_content)
        # 4. save
        t_save = Process(target=self.save_content_list)
        process_list.append(t_save)
        for process in process_list:
            process.daemon = True  # make the workers daemon processes
            process.start()
        for q in [self.url_queue, self.html_queue, self.content_list_queue]:
            q.join()  # block the main process until every queue's count reaches 0
if __name__ == '__main__':
t1 = time.time()
qiubai = Qiubai()
qiubai.run()
print("total cost:", time.time() - t1)
Thread pool crawlers
Instantiating the pool object
from multiprocessing.dummy import Pool
pool = Pool(5)  # the size defaults to the number of cpus
- from multiprocessing.dummy import Pool
- pool.apply_async(func, callback=func2)
import requests
from lxml import etree
import time
from queue import Queue
from multiprocessing.dummy import Pool
class Qiubai:
def __init__(self):
self.temp_url = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
self.queue = Queue()
self.pool = Pool(5)
self.is_running = True
self.total_request_num = 0
self.total_response_num = 0
def get_url_list(self):
for i in range(1, 14):
self.queue.put(self.temp_url.format(i))
self.total_request_num += 1
def parse_url(self, url):
response = requests.get(url, headers=self.headers)
print(response)
return response.content
    def get_content_list(self, html_str):  # extract the data
html = etree.HTML(html_str)
li_list = html.xpath("//div[@class='recommend-article']//li")
content_list = []
for li in li_list:
item = dict()
item["user_name"] = li.xpath(".//div[@class='recmd-right']/a/text()")
content_list.append(item)
return content_list
def save_content_list(self, content_list):
for content in content_list:
print(content)
    def _execute_request_content_save(self):  # request one url, then extract and save
        url = self.queue.get()
        html_str = self.parse_url(url)
        # 3. extract the data
        content_list = self.get_content_list(html_str)
        # 4. save
        self.save_content_list(content_list)
        self.total_response_num += 1
    def _callback(self, temp):
        if self.is_running:
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)
    def run(self):  # main logic
        # 1. prepare the url addresses
        self.get_url_list()
        for i in range(3):  # concurrency of 3
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)
        while True:
            time.sleep(0.0001)
            if self.total_response_num >= self.total_request_num:
                self.is_running = False
                break
if __name__ == '__main__':
t1 = time.time()
qiubai = Qiubai()
qiubai.run()
print("total cost:", time.time() - t1)
Crawling with a coroutine pool
import gevent.monkey
gevent.monkey.patch_all()
from gevent.pool import Pool
import requests
from lxml import etree
import time
from queue import Queue
# from multiprocessing.dummy import Pool
class Qiubai:
def __init__(self):
self.temp_url = "https://www.qiushibaike.com/8hr/page/{}/"
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
self.queue = Queue()
self.pool = Pool(5)
self.is_running = True
self.total_request_num = 0
self.total_response_num = 0
def get_url_list(self):
for i in range(1, 14):
self.queue.put(self.temp_url.format(i))
self.total_request_num += 1
def parse_url(self, url):
response = requests.get(url, headers=self.headers)
print(response)
return response.content
    def get_content_list(self, html_str):  # extract the data
html = etree.HTML(html_str)
li_list = html.xpath("//div[@class='recommend-article']//li")
content_list = []
for li in li_list:
item = dict()
item["user_name"] = li.xpath(".//div[@class='recmd-right']/a/text()")
content_list.append(item)
return content_list
def save_content_list(self, content_list):
for content in content_list:
print(content)
    def _execute_request_content_save(self):  # request one url, then extract and save
        url = self.queue.get()
        html_str = self.parse_url(url)
        # 3. extract the data
        content_list = self.get_content_list(html_str)
        # 4. save
        self.save_content_list(content_list)
        self.total_response_num += 1
    def _callback(self, temp):
        if self.is_running:
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)
    def run(self):  # main logic
        # 1. prepare the url addresses
        self.get_url_list()
        for i in range(3):  # concurrency of 3
            self.pool.apply_async(self._execute_request_content_save, callback=self._callback)
        while True:
            time.sleep(0.0001)
            if self.total_response_num >= self.total_request_num:
                self.is_running = False
                break
if __name__ == '__main__':
t1 = time.time()
qiubai = Qiubai()
qiubai.run()
print("total cost:", time.time() - t1)
Installing the driver
- chromedriver must match the installed chrome version
- if you get a permission error: sudo chmod +x phantomjs
- chromedriver --version
- phantomjs --version
Methods for locating elements
- driver.find_element  # returns the first matching element; raises if none is found
- driver.find_elements  # returns a list of matching elements; returns an empty list if none is found
Getting attributes and text
- get text: element.text
- get an attribute value: element.get_attribute("href")
from selenium import webdriver
driver = webdriver.Chrome()
driver.get("https://movie.douban.com/top250")
# the difference between find_element and find_elements
# ret2 = driver.find_element_by_xpath("//div[@class='item111']")
#
# print(ret2)
# print("*"*100)
# ret1 = driver.find_elements_by_xpath("//div[@class='item111']")
# print(ret1)
# get text
# ret3 = driver.find_elements_by_xpath("//span[@class='title']")
# # print(ret3)
# ret3 = [i.text for i in ret3]
# print(ret3)
#
# # get attribute values
# ret4 = driver.find_elements_by_xpath("//span[@class='title']/..")
# print(ret4[0].get_attribute("href"))
# ret4 = [i.get_attribute("href") for i in ret4]
# print()
# locate an element by the exact text of the link
ret5 = driver.find_element_by_link_text("后页>").get_attribute("href")
print(ret5)
# locate an element by part of the link text
ret6 = driver.find_element_by_partial_link_text("后页").get_attribute("href")
print(ret6)
driver.quit()
from selenium import webdriver
import time
class Douyu:
def __init__(self):
        self.start_url = "https://www.douyu.com/directory/all"
self.driver = webdriver.Chrome()
    def get_content_list(self):  # extract the data
li_list = self.driver.find_elements_by_xpath("//ul[@id='live_list_contentbox']/li")
content_list = list()
for li in li_list:
item = dict()
item["title"] = li.find_element_by_xpath("./a").get_attribute("title")
item["aochor"] = li.find_element_by_xpath(".//span[@calss='dy-name ellipsis fl']").text
item["watch_num"] = li.find_element_by_xpath(".//span[@class='dy-num fr']").text
content_list.append(item)
        # locate the next-page element
next_url = self.driver.find_elements_by_xpath("//a[@class='shark-pager-next']")
next_url = next_url[0] if len(next_url) > 0 else None
return content_list, next_url
    def save_content_list(self, content_list):  # save
pass
    def run(self):  # main logic
        # 1. start_url
        # 2. send the request, get the response
        self.driver.get(self.start_url)
        # 3. extract the data
        content_list, next_url = self.get_content_list()
        # 4. save
        self.save_content_list(content_list)
        # 5. extract data from the following pages
while next_url is not None:
            next_url.click()  # clicking before the page has fully loaded raises an error
time.sleep(3)
content_list, next_url = self.get_content_list()
self.save_content_list(content_list)
if __name__ == '__main__':
douyu = Douyu()
douyu.run()
Switching frames
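The original notes stop at this heading; as a reminder, here is a minimal sketch of selenium's frame switching (the page url and the frame name "login_frame" are placeholders):
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://example.com")        # placeholder page containing an iframe

driver.switch_to.frame("login_frame")    # switch into the iframe by name/id (placeholder)
# ... locate elements inside the iframe here ...
driver.switch_to.default_content()       # switch back to the main document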
Recognizing captchas
- url unchanged, captcha unchanged
  - request the captcha's url, get the response, and run recognition on it
- url unchanged, captcha changes
  - the captcha request and the login request must carry the same set of cookies to succeed; use requests.Session() for this
- handling captchas with selenium
  - request the captcha with the cookies taken from selenium's driver
  - or take a screenshot with selenium and read the captcha from it (see the sketch below)
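A hedged sketch of the screenshot approach, assuming a selenium version where WebElement.screenshot is available; the page url and the captcha element's xpath are placeholders:
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://example.com/login")  # placeholder login page

# screenshot just the captcha element (placeholder xpath)
captcha = driver.find_element_by_xpath("//img[@id='captcha']")
captcha.screenshot("captcha.png")

# "captcha.png" can now be sent to an OCR tool or shown to a human
code = input("please type the captcha shown in captcha.png: ")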