Data Scraping

Using requests

Basic steps for scraping data:
- Find the URL of the page that contains the data you want.
- Import the requests module.
- Fetch the page with requests.get: response = requests.get(url, headers=...)
- If the response does not contain the data, look for the page's data API and request the data from that endpoint directly.
- If you cannot find an API, try adding User-Agent and cookie fields to the headers.
- If that still fails, give up on requests and use selenium instead.

Common response attributes:
- response.status_code: check whether the request succeeded
- response.encoding: the encoding used to decode the body (can be set)
- response.content: the response body as raw bytes
- response.text: the response body decoded as a string
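A minimal sketch tying these steps together (the URL and header values below are placeholders, not from the original notes):

import requests

url = 'https://example.com'  # hypothetical target page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    # 'cookie': '...',  # add a cookie here if the site requires a login session
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
    response.encoding = 'utf-8'  # override if the auto-detected encoding is wrong
    print(response.text[:200])   # the body as a string
    # response.content           # the body as bytes (images, files, ...)
else:
    print('request failed:', response.status_code)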
Using selenium

Basic usage steps:
- Import the module: from selenium import webdriver
- Create a browser object: b = webdriver.Chrome()
  (other browsers work too; each needs its own browser driver installed)
- Fetch the page data: b.get(url), then read the rendered HTML from b.page_source
- Close the page: b.close()
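Put together, the steps above look like this (a minimal sketch; the URL is just an example). One caveat: the find_element_by_css_selector style used in the examples below is the Selenium 3 API; Selenium 4 removed it in favor of b.find_element(By.CSS_SELECTOR, '...') with By imported from selenium.webdriver.common.by.

from selenium import webdriver

b = webdriver.Chrome()          # requires a matching ChromeDriver installed
b.get('https://www.baidu.com')  # load the page
print(b.page_source[:200])      # the rendered HTML as a string
b.close()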
Advanced usage

options

from selenium import webdriver
from selenium.webdriver import ChromeOptions

# 1. Create the options object
options = ChromeOptions()
# 2. Hide the "controlled by automated test software" automation flag
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 3. Disable image loading so pages load faster
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

b = webdriver.Chrome(options=options)
url = 'https://www.baidu.com'
b.get(url)
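Another commonly used switch, not covered in the original notes, is headless mode, which runs Chrome without opening a visible window:

# run Chrome without a visible window (useful on servers)
options.add_argument('--headless')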
interaction

from selenium import webdriver
from selenium.webdriver import ActionChains
import time

b = webdriver.Chrome()

def jing_dong():
    # open the JD home page
    b.get('https://www.jd.com/')
    # find the login button and click it
    login_btn = b.find_element_by_css_selector('.user_login')
    login_btn.click()
    # switch to the account/password login tab
    user_btn = b.find_element_by_css_selector('.login-tab.login-tab-r')
    user_btn.click()
    user_name = b.find_element_by_css_selector('#loginname')
    password = b.find_element_by_css_selector('#nloginpwd')
    user_name.send_keys('abc')
    password.send_keys('123456')
    # click the submit button
    login_btn1 = b.find_element_by_css_selector('#loginsubmit')
    login_btn1.click()
    # drag the captcha slider
    slider = b.find_element_by_css_selector(
        '#JDJRV-wrap-loginsubmit > div > div > div > div.JDJRV-slide-bg > div.JDJRV-slide-inner.JDJRV-slide-btn')
    # create an action chain object
    action = ActionChains(b)
    # queue a "press and hold the element" action
    action.click_and_hold(slider)
    # queue a drag action and run everything queued so far
    # drag_and_drop_by_offset(element, x offset, y offset)
    # drag_and_drop(element, target element)
    action.drag_and_drop_by_offset(slider, 150, 0).perform()
    # queue the release action and run it
    action.release()
    action.perform()

def scroll():
    b.get('https://www.jd.com/')
    # b.execute_script('alert("hello world")')
    # height = 100
    # while height < 17720:
    #     # scroll via JavaScript
    #     b.execute_script(f'window.scrollTo(0, {height})')
    #     # time.sleep(1)
    #     height += 10
    js = """
    height = 100
    // set a timer that scrolls 200 pixels every 300 milliseconds
    t = setInterval(function(){
        max = document.body.scrollHeight
        window.scrollTo(0, height)
        height += 200
        if(height > max){
            clearInterval(t)
        }
    }, 300)
    """
    b.execute_script(js)

if __name__ == '__main__':
    scroll()
wait

"""
__author__: 赖良
"""
import time
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
b = webdriver.Chrome(options=options)
b.get('https://www.51job.com/')

input_win = b.find_element_by_css_selector('#kwdselectid')
input_win.send_keys('python')
button = b.find_element_by_css_selector('body > div.content > div > div.fltr.radius_5 > div > button')
button.click()

i = 1
while i < 36:
    # print(b.page_source)
    time.sleep(1)
    # next_btn = b.find_element_by_class_name('next')
    # next_btn.click()
    # explicit wait: block until a condition is met, for at most 10 seconds
    wait = WebDriverWait(b, 10)
    # wait until the "next page" element is present in the DOM
    next_btn1 = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'next')))
    try:
        next_btn1.click()
    except:
        time.sleep(1)
    i += 1
Proxy IPs

import requests

# ============= fetch proxy IPs =============
def get_ip():
    response = requests.get(
        'http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=6226c130427f487385ad7b5235bc603c&count=5&expiryDate=0&format=2&newLine=3')
    if response.status_code == 200:
        # on failure the API returns a JSON object, so the body starts with '{'
        if response.text[0] == '{':
            print('failed to get proxy IPs')
        else:
            # one "ip:port" per line; drop the trailing empty line
            return response.text.split('\n')[:-1]
    else:
        print('request failed')
        print(response)

def get_data():
    ip = get_ip()
    # keep asking until the proxy API returns usable addresses
    while ip is None:
        ip = get_ip()
    proxies = {'http': f'http://{ip[0]}',
               'https': f'https://{ip[1]}'}
    response = requests.get('https://cd.fang.anjuke.com/loupan/all/p2/', proxies=proxies)
    if response.status_code == 200:
        print(response.text)
    else:
        print('request failed')
        print(response)

# sample return value of get_ip():
# ['60.167.102.57:46674', '114.224.113.51:41162', '60.167.103.207:23267', '115.219.0.192:23061', '114.231.7.4:45872']

if __name__ == '__main__':
    get_data()
Data Parsing

re

Run regular expressions over the fetched string to extract the data you want.
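A small sketch of the idea (the HTML fragment and pattern are made up for illustration):

import re

html = '<div class="price">12000</div><div class="price">13500</div>'
# capture the digits inside every price div
prices = re.findall(r'<div class="price">(\d+)</div>', html)
print(prices)  # ['12000', '13500']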
bs4

from bs4 import BeautifulSoup
import requests

# 1. Ways to parse an HTML page
"""
1) regular expressions - needs strong regex skills, no front-end knowledge (re)
2) selecting tags with CSS selectors - needs familiarity with CSS selectors / jQuery (bs4, pyquery)
3) selecting tags with XPath - needs familiarity with HTML structure and tags (lxml)
"""

# prepare the data
def get_data():
    with open('files/anjuke.html', 'r', encoding='utf-8') as f:
        return f.read()

# 2. Using bs4
# 1) Create a parser object from the page content
#    BeautifulSoup(page content, parser type)
#    page content - an HTML string, usually fetched with requests or selenium
soup = BeautifulSoup(get_data(), 'lxml')

# 2) Get tags with CSS selectors
"""
bs object.select(css selector)     - all tags matched by the selector
bs object.select_one(css selector) - the first tag matched by the selector
"""
houses_name_els = soup.select('.items-name')
houses_details = soup.select('.favor-pos')
houses_price = soup.select('.price')
houses_picture = soup.select('.pic > img')
# print(houses_picture)
# print(houses_price)
# print(houses_details)
# print(houses_name_els)

# 3) Get tag content and attributes
# a. tag content
#    tag object.string
#    tag object.get_text()
#    tag object.contents
# for item in houses_name_els:
#     print(item.string)
# b. tag attributes
#    tag object.attrs[attribute name]
# img = iter(houses_picture)
# print(next(img))
# for x in houses_details:
#     print(x.attrs['href'], x.get_text(), next(img).attrs['src'], '\n')
# for i in houses_picture:
#     print(i.attrs['src'])
# for i in houses_price:
#     print(i.get_text())

# 4) Get tags by attribute or content
# a. find_all(attrs={attr1: value1, attr2: value2, ...})
# result = soup.find_all(attrs={'height': '135'})
# print(result)
# b. find_all(text=content) - of limited use: it returns the matched strings, not the tags
# result = soup.find_all(text='12000')  # ['12000', '12000', '12000']
# print(result)
import requests
from bs4 import BeautifulSoup

def get_data():
    with open('files/anjuke.html', 'r', encoding='utf-8') as f:
        return f.read()

def analysis_data(data):
    soup = BeautifulSoup(data, 'lxml')
    # the div of every housing development on the page
    house_box = soup.select('.item-mod')
    for h_div in house_box[2:]:
        print('===================================')
        # print(h_div.select_one('.items-name').get_text())
        # not every item has a price tag
        try:
            price = h_div.select_one('.price').get_text()
        except:
            price = 'None'
        # print('sample image:', h_div.select_one('.pic>img').attrs['src'])
        # print('details:', h_div.select_one('.pic').attrs['href'])
        house = {
            'name': h_div.select_one('.items-name').get_text(),
            'price': price,
            'pic_url': h_div.select_one('.pic>img').attrs['src'],
            'particulars': h_div.select_one('.pic').attrs['href']
        }
        print(house)

if __name__ == '__main__':
    analysis_data(get_data())
pyquery

from pyquery import PyQuery

def get_data():
    with open('files/anjuke.html', 'r', encoding='utf-8') as f:
        return f.read()

# 1. Create a PyQuery object
#    PyQuery(an HTML string, or a page URL directly)
#    doc points at the html tag
doc = PyQuery(get_data())
# print(type(doc))

# 2. Get tags
#    PyQuery object(css selector) - returns the tags matched by the selector as a new PyQuery object
names = doc('.items-name')

# 3. Get tag content
#    PyQuery object.text()
#    operating on a PyQuery object applies to every tag inside it
print(names.text())
# iterating over a PyQuery object yields each tag in the container
for x in names:
    # x - a raw tag
    # PyQuery(x) - the PyQuery object wrapping that tag
    print(x, PyQuery(x).text())

# 4. Get tag attributes
# 1) the value attribute
#    PyQuery object.val()
print(doc('.cell-phone').val())
# 2) ordinary attributes
#    PyQuery object.attr(attribute name)
#    returns the value from the first matched tag only
result = doc('.pic>img').attr('src')
print(result)
# 3) match part of an attribute value:
#    doc("tag[attribute*=partial value]")
from pyquery import PyQuery

def get_data():
    with open('files/anjuke.html', 'r', encoding='utf-8') as f:
        return f.read()

def analysis_data(data):
    doc = PyQuery(data)
    # all_house_tag = doc('.key-list.imglazyload>.item-mod')
    # .items() yields each matched tag already wrapped as a PyQuery object
    all_house_tag = doc('.key-list.imglazyload>.item-mod').items()
    # for x in all_house_tag:
    #     pq_x = PyQuery(x)
    for pq_x in all_house_tag:
        name = pq_x('.items-name').text()
        area = pq_x('.building-area').text()
        price = pq_x('.price').text()
        title = pq_x('.group-mark.soj').text()
        pic = pq_x('.pic>img').attr('src')
        house = {
            'name': name,
            'area': area,
            'price': price,
            'title': title,
            'pic': pic
        }
        print(house)

if __name__ == '__main__':
    analysis_data(get_data())
xpath

# 1. How XPath parsing works
#    Locate the tags you need by their path through the html/xml tag structure.
# 2. The xml data format
#    xml, like json, is a general-purpose format for passing data between programs in different languages.
#    xml vs json: xml is easier to secure (convenient to encrypt); json is more lightweight (faster to transmit).
#    xml, like html, organizes its data in nodes (tags).
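The data.xml file used below is not included in the notes. The sketch here writes a hypothetical file whose structure is consistent with the queries that follow (every title, price, and author is invented for illustration):

# write a made-up data.xml so the XPath examples below have something to parse
sample = '''<bookStore>
    <name>示例书店</name>
    <books>
        <book>
            <name flag="hot">流畅的Python</name>
            <price>129</price>
            <author>Luciano Ramalho</author>
        </book>
        <book>
            <name>Python编程:从入门到实践</name>
            <price>89</price>
            <author>Eric Matthes</author>
        </book>
        <book>
            <name>利用Python进行数据分析</name>
            <price>119</price>
            <author>Wes McKinney</author>
        </book>
    </books>
</bookStore>'''
with open('data.xml', 'w', encoding='utf-8') as f:
    f.write(sample)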
from lxml import etree

with open('data.xml', encoding='utf-8') as f:
    data = f.read()

# 1. Get the root node of the tree
#    tree - the whole xml/html document
#    root node - the outermost tag of the xml or html data
bookStore = etree.XML(data)
print(bookStore)

# 2. Parse the data with XPath
# ==================== 1) paths ====================
# syntax: node object.xpath(path)
# 1) / - absolute path starting from the root
#    /A/B - the B nodes under node A
result = bookStore.xpath('/bookStore/books/book/author')
print(result)
book_list = bookStore.xpath('/bookStore/books/book')
print(book_list)
# 2) // - search by path starting from anywhere
#    //A/B - every B node under an A node, anywhere in the document
#    //B - every B node in the document
name_list = bookStore.xpath('//name')
print(name_list)
# 3) ./ - search starting from the current node
#    (a bare node name also starts from the current node)
name_list = book_list[0].xpath('./name/text()')
print(name_list)
name_list = book_list[1].xpath('name/text()')
print(name_list)
# 4) ../ - search starting from the parent of the current node
name_list = bookStore.xpath('//name')
name_1 = name_list[0]
result = name_1.xpath('..')
print(result)
result = name_1.xpath('../books')
print(result)

# ==================== 2) tag content and attributes ====================
# 1) text() - get the tag's content
store_name = bookStore.xpath('/bookStore/name/text()')
print(store_name)
store_name = bookStore.xpath('//name')[0].xpath('./text()')
print(store_name)
# 2) @attribute name - get the value of the given attribute
flag = bookStore.xpath('//book/name')[0].xpath('./@flag')
print(flag)

# ==================== 3) predicates ====================
# path[predicate] - filter the tags on the path by the predicate's condition
# 1) [N] - the Nth matching tag
#    //book[1]/name - the name under the first book
name_1 = bookStore.xpath('//book[1]/name')
print(name_1)
# 2) [last()]   - the last matching tag
#    [last()-N] - the tag N before the last (i.e. the (N+1)th from the end)
# 3) [position()<N] - the first N-1 tags (also >, <=, >=)
name_list = bookStore.xpath('//book[position()>=3]/name/text()')
print(name_list)
# 4) [@attribute] - tags that have the given attribute
print(bookStore.xpath('//name[@flag]/text()'))
# 5) [@attribute=value] (also >, <, >=, <=)
print(bookStore.xpath('//name[@flag="hot"]/text()'))
# 6) [tag=value], [tag>value] (also <, >=, <=)
book_list = bookStore.xpath('//book[name="流畅的Python"]')
print(book_list)
book_list = bookStore.xpath('//book[price>100]')
print(book_list)

# ==================== 4) wildcards ====================
# * - matches any node or attribute
result = bookStore.xpath('//book[1]/*/text()')
print(result)
# //*[@*] - every node that has at least one attribute
result = bookStore.xpath('//*[@*]')
print(result)

# ==================== 5) selecting several paths (branches) ====================
# path1 | path2 | path3 | ... - select the tags of every listed path at once
result = bookStore.xpath('//book/name/text()|//book/price/text()')
print(result)
Saving Data to CSV

Writing data

import csv

# ================ 1. write rows as lists ================
def write_data1():
    # 1. open the file
    #    the newline='' argument is required when writing csv files
    with open('files/data1.csv', 'w', encoding='utf-8', newline='') as f:
        # 2. create the writer object
        #    csv.writer(file object)
        writer = csv.writer(f)
        # 3. write the data
        writer.writerow(['姓名', '性别', '年龄', '电话'])
        writer.writerows([
            ['张三', '男', '30', '123213123123'],
            ['张三', '男', '30', '123213123123'],
            ['张三', '男', '30', '123213123123']
        ])

# ================ 2. write rows as dictionaries ================
def write_data2():
    # 1. open the file
    with open('files/data2.csv', 'w', encoding='utf-8', newline='') as f:
        # 2. create the writer
        #    csv.DictWriter(file object, header list)
        #    the header list must contain every key of the dictionaries being written
        writer = csv.DictWriter(f, ['name', 'sex', 'age', 'tel'])
        # write the header list as the first row of the file
        writer.writeheader()
        # write one row at a time
        # writer.writerow(
        #     {
        #         'name': '张三',
        #         'sex': '男',
        #         'age': 18,
        #         'tel': 110
        #     }
        # )
        # write several rows at once
        writer.writerows([
            {
                'name': '李四',
                'sex': '男',
                'age': 18,
                'tel': 110
            },
            {
                'name': '张三',
                'sex': '男',
                'age': 18,
                'tel': 120
            }
        ])

if __name__ == '__main__':
    write_data2()
Reading data

import csv

def reader_list():
    # ================ read rows as lists ================
    # 1. open the file
    with open('files/数据分析.csv', encoding='utf-8', newline='') as f:
        # 2. create the reader
        reader = csv.reader(f)
        # 3. read the data row by row
        #    rows are pulled from the reader the same way elements are pulled from an iterator
        for x in reader:
            print(x)

def reader_dict():
    # ================ read rows as dictionaries ================
    # 1. open the file
    with open('files/data1.csv', encoding='utf-8', newline='') as f:
        # 2. create the reader
        #    csv.DictReader(file object, list of dictionary keys)
        #    if the key list is omitted, the first row is used as the keys
        reader = csv.DictReader(f)
        # 3. read the data
        # print(dict(next(reader)))
        for x in reader:
            print(dict(x))

reader_dict()
# reader_list()
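A short end-to-end sketch combining the three stages covered in these notes (fetch with requests, parse with bs4, save with csv). The URL, selector, and output path are placeholders and would need to be adapted to a real site:

import csv
import requests
from bs4 import BeautifulSoup

# hypothetical target page; the 'a' selector is just a stand-in
response = requests.get('https://example.com', headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'lxml')
# collect link text and href for every anchor on the page
rows = [[a.get_text(), a.attrs.get('href', '')] for a in soup.select('a')]

with open('files/links.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['text', 'href'])
    writer.writerows(rows)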