Python 3 Web Scraping Notes
1. Basic request modules
1. The requests module
1. Installing requests
pip3 install requests
2. Sending a GET request
import requests
response=requests.get('https://www.baidu.com/')
print(response.text)
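Besides .text, the response object exposes a few attributes worth knowing; a minimal sketch using the same request as above:
import requests
response = requests.get('https://www.baidu.com/')
print(response.status_code)    # HTTP status code, e.g. 200
print(response.encoding)       # encoding guessed from the response headers
print(response.content[:100])  # raw bytes, before any decoding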
3. Sending a POST request
url = '***'
d = {'key1': 'value1', 'key2': 'value2'}
r = requests.post(url, data=d)
print(r.text)
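requests can also send a JSON body instead of form data by swapping data for json; a minimal sketch against the same placeholder url:
r = requests.post(url, json=d)  # serializes d and sets Content-Type: application/json
print(r.status_code)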
4. Common requests parameters
- url: the address the request is sent to.
- data: the form data for a POST request; this is also the main difference between a GET and a POST call.
- headers: the request headers.
- params: the query-string parameters carried in the URL.
Example: https://search.jd.com/Search?keyword=手机&enc=utf-8&wq=手机&pvid=c0214112bbca49e49768e40f38f272cd is a JD search page for mobile phones. Broken into parameters:
url = 'https://search.jd.com/Search'
headers = {
    "User-Agent": "×",
    "Cookie": "×"
}
params = {
    'keyword': '手机',
    'enc': 'utf-8',
    'wq': '%E6%89%8B%E6%9C%BA',
    'pvid': 'c0214112bbca49e49768e40f38f272cd'
}
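Putting the pieces together, the dict goes in through params and requests builds and encodes the query string itself (the × placeholders stand for your own header values):
r = requests.get(url, headers=headers, params=params)
print(r.url)   # shows the final URL with the query string appended
print(r.text)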
5. Using a proxy
url="https://www.baidu.com"
#设置代理,从免费代理网站上找出一个可用的代理IP
proxies={'https':'101.236.54.97:8866'}
#使用代理IP进行访问
res=requests.get(url,proxies=proxies,timeout=10)
content=res.text
print(content)
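Free proxies fail often, so it helps to guard the call; a minimal sketch reusing the proxies dict above:
try:
    res = requests.get(url, proxies=proxies, timeout=10)
    print(res.status_code)
except requests.exceptions.RequestException as e:  # covers timeouts, connection and proxy errors
    print("proxy failed:", e)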
2. The urllib module
1. Basic usage
from urllib import request
response = request.urlopen('http://www.baidu.com')  # urlopen needs the scheme; a bare domain raises ValueError
page = response.read()
page = page.decode('utf-8')
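urlopen raises an exception on failure rather than returning an error page; a minimal guard, assuming the same URL:
from urllib import request, error
try:
    response = request.urlopen('http://www.baidu.com')
    print(response.status)  # HTTP status code
except error.URLError as e:
    print("request failed:", e)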
2. Requesting with a Request object
url = '**'
headers = {
    'User-Agent': '**',
}
req = request.Request(url, headers=headers)
page = request.urlopen(req).read()
page = page.decode('utf-8')
3. Using a proxy
from urllib import request
proxy = request.ProxyHandler({'http': '5.22.195.215:80'})  # configure the proxy
opener = request.build_opener(proxy)  # build an opener that routes through it
request.install_opener(opener)  # optionally install it globally
page = opener.open(url).read()
page = page.decode('utf-8')
print(page)
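Note that ProxyHandler only applies to the schemes named in its dict, so an http-only entry leaves https requests unproxied; to cover both, list both keys (same IP shown here just for illustration):
proxy = request.ProxyHandler({'http': '5.22.195.215:80', 'https': '5.22.195.215:80'})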
2. Small examples
1. Using cookies to scrape a page behind a login
After logging in to your account, right-click -> Inspect; the cookies can be read from there.
Code:
from urllib import request
url = "https://music.163.com/user/home?id=1526458360"
headers = {
    "Cookie": "×",
    "Host": "music.163.com",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
}
req = request.Request(url, headers=headers)
page = request.urlopen(req).read()
page = page.decode('utf-8')
print(page)
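The same copied Cookie header works with requests unchanged, since custom headers are passed through verbatim; reusing url and headers from above:
import requests
page = requests.get(url, headers=headers).text
print(page)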
2. Scraping Baidu Tieba with bs4
import requests
from bs4 import BeautifulSoup
class TieBa:
    def __init__(self, name, page):
        self.name = name
        self.page = page
        self.url = "https://tieba.baidu.com/f"
        self.headers = {}
        self.num = 0

    def queryset(self):
        for i in range(int(self.page)):
            pn = 50 * i  # watching the URL change shows Tieba pages by a pn offset, 50 threads per page
            data = {'kw': self.name, 'pn': pn}
            self.getrequest(self.url, data)

    def getrequest(self, url, data):
        print("Fetching data...")
        rlt = requests.get(url=url, headers=self.headers, params=data)
        html = rlt.content.decode()
        soup = BeautifulSoup(html, "lxml")
        # thread titles live in <a class="j_th_tit">
        list1 = soup.find_all('a', attrs={"class": "j_th_tit"})
        print(len(list1))
        self.save(list1)

    def save(self, list1):
        for a in range(len(list1)):
            with open("./Demo02.txt", "a") as f:
                f.write(list1[a].text + "\n")
        print("Saved!")

def main():
    name = input("Enter the name of the tieba to scrape: ")
    page = input("Enter the number of pages to scrape (50 threads per page): ")
    tieba = TieBa(name, page)
    tieba.queryset()

if __name__ == '__main__':
    main()
3. Scraping Douban with XPath
from lxml import etree
import requests

class DouBan:
    def __init__(self):
        self.url = "https://movie.douban.com/cinema/nowplaying/xiangxi/"
        self.headers = {}
        self.num = 0

    def queryset(self):
        self.getrequest(self.url)

    def getrequest(self, url):
        print("Fetching data...")
        rlt = requests.get(url=url, headers=self.headers)
        html = etree.HTML(rlt.content.decode())
        # each now-playing movie links out through <a class="ticket-btn"> under <li class="stitle">
        list1 = html.xpath('//li[@class="stitle"]/a[@class="ticket-btn"]')
        print(len(list1))
        self.save(list1)

    def save(self, list1):
        print("Saving data...")
        for a in range(len(list1)):
            with open("./Demo03.txt", "a") as f:
                f.write(list1[a].text.strip() + "\n")
        print("Saved!")

def main():
    douban = DouBan()
    douban.queryset()

if __name__ == '__main__':
    main()
4. Multithreaded JD scraper
This only collects the product name and its three-level category path; the start URLs were written to list.txt beforehand and are all third-level category pages. The proxy handling may be a bit messy.
# coding=utf-8
import random
import requests
from lxml import etree
import queue
import threading
import time

class JingDong:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
        self.url_queue = queue.Queue()
        self.html_queue = queue.Queue()

    def get_url_list(self):  # load the start URLs
        file = open('list.txt', 'r')
        lines = file.readlines()
        print(len(lines))
        for i in range(len(lines)):
            self.url_queue.put(lines[i].strip())  # queue every URL from the file, minus the trailing newline

    def parse_url(self):
        while True:
            time.sleep(random.randint(3, 4))
            url = self.url_queue.get()
            print("Fetched page: " + url)
            response = requests.get(url, headers=self.headers)
            # lock
            threadLock.acquire()
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
            # unlock
            threadLock.release()

    def get_content_list(self):  # extract the data
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            print("Extracting data...")
            next_url = None
            try:
                # address of the next page
                next_url = html.xpath('//a[@class="pn-next"]/@href')[0]
                qs = next_url.split("?")[-1]
                next_url = "https://list.jd.com?" + qs
                print("next: " + next_url)
            except Exception as e:
                print(e)
            category1 = category2 = category3 = ""
            try:
                category1 = html.xpath('//a[@class="crumbs-link"]')[0].text
                category2 = html.xpath('(//div[@class="crumbs-nav-main clearfix"]//span)')[0].text
                category3 = html.xpath('(//div[@class="crumbs-nav-main clearfix"]//span)')[1].text
            except Exception as e:
                print(e)
            print("category1: " + category1)
            name_list = html.xpath('//div[@class="p-name"]//a//em')
            for a in name_list:
                print("name: " + a.text + "\n")
                line = a.text + " " + "category: "
                line += category1 + ">" + category2 + ">" + category3
                print(line)
                # lock
                threadLock.acquire()
                print("Saving data...")
                # append the record to a file
                with open("result1.txt", "a") as f:
                    f.write(line + "\n")
                print("Saved!")
                # unlock
                threadLock.release()
            # lock
            threadLock.acquire()
            # queue the next page's address for the request threads
            if next_url:
                self.url_queue.put(next_url)
            self.html_queue.task_done()
            # unlock
            threadLock.release()

    def run(self):
        # thread list
        thread_list = []
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. send the requests
        for i in range(2):  # two request threads
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        # 3. extract the data
        t_content = threading.Thread(target=self.get_content_list)
        thread_list.append(t_content)
        for t in thread_list:
            t.daemon = True
            t.start()
        for q in [self.url_queue, self.html_queue]:
            q.join()
        print("Main thread finished")

threadLock = threading.Lock()

if __name__ == '__main__':
    jingdong = JingDong()
    jingdong.run()
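The shutdown works through a standard pattern: the workers are daemon threads looping forever, queue.join() blocks until every item put on a queue has been matched by a task_done() call, and once both queues drain, the main thread exits and the daemons die with it. Note that queue.Queue is itself thread-safe, so the explicit threadLock is mainly needed around the shared output file. A stripped-down sketch of the pattern:
import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()
        print("processed", item)
        q.task_done()  # mark the item as finished

t = threading.Thread(target=worker)
t.daemon = True  # dies automatically when the main thread exits
t.start()

for i in range(5):
    q.put(i)
q.join()  # returns once every item has been marked done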
3. Using selenium
1. Element-locating methods (a short usage sketch follows the list):
find_element_by_id (returns a single element)
find_elements_by_xpath (returns a list of matching elements)
find_elements_by_link_text (returns elements matched by their exact link text)
find_elements_by_partial_link_text (returns elements whose link text contains the given text)
find_elements_by_tag_name (returns elements matched by tag name)
find_elements_by_class_name (returns elements matched by class name)
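A minimal usage sketch, assuming a local chromedriver on PATH (note that Selenium 4 replaces these helpers with find_element(By.ID, ...)):
from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is installed and on PATH
driver.get("https://www.baidu.com/")
box = driver.find_element_by_id("kw")          # a single element
links = driver.find_elements_by_tag_name("a")  # a list of elements
print(len(links))
driver.quit()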
2. Common operations
1. Loading a page
from selenium import webdriver
driver = webdriver.PhantomJS("c:…/phantomjs.exe")  # path to the PhantomJS binary
driver.get("https://www.baidu.com/")  # load the page
driver.save_screenshot("baidu.png")  # take a screenshot
2. Typing into an input field
driver.find_element_by_id("kw").send_keys("美女")  # the input box with id "kw"
3. Clicking a button
driver.find_element_by_id("su").click()  # the button with id "su"
4. Inspecting the session state
driver.page_source
driver.get_cookies()
driver.current_url
5. Quitting
driver.close()  # close the current window
driver.quit()  # quit the browser entirely