October 25 Study Summary
1. Review
Keyword arguments of requests.get:
url='...'   - URL (Uniform Resource Locator)
params={}   - query-string parameters
headers={}  - request headers
cookies     - cookies (the browser-side local storage mechanism)
proxies={}  - proxies
auth=()     - authentication credentials (HTTP Basic Auth)
timeout=5   - timeout in seconds
verify      - SSL certificate verification
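A minimal sketch that puts these parameters together in one call (the URL, proxy address, and credentials below are made-up placeholders, not real endpoints):

import requests

# Hypothetical values purely for illustration
resp = requests.get(
    url='https://example.com/api/items',         # url - Uniform Resource Locator
    params={'page': 1},                          # params - query-string parameters
    headers={'User-Agent': 'Mozilla/5.0'},       # headers - request headers
    cookies={'session_id': 'xyz'},               # cookies - sent along with the request
    proxies={'https': 'http://127.0.0.1:8888'},  # proxies - route traffic through a proxy
    auth=('user', 'secret'),                     # auth - HTTP Basic authentication
    timeout=5,                                   # timeout - give up after 5 seconds
    verify=True,                                 # verify - check the server's SSL certificate
)
print(resp.status_code)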
Response status codes:
2xx - success
3xx - redirection
4xx - client error
    400 - Bad Request
    401 - Unauthorized
    403 - Forbidden
    404 - Not Found
    405 - Method Not Allowed
5xx - server error
    500 - Internal Server Error
    502 - Bad Gateway
    503 - Service Unavailable
re - regular expressions
Quantifiers:
{n}   - exactly n times
{n,m} - at least n and at most m times
{n,}  - at least n times
{,n}  - at most n times
*     - zero or more times (*? - lazy match: match as few characters as possible)
+     - one or more times (+? - lazy match: match as few characters as possible)
?     - zero or one time
The re module:
re.findall / re.search / re.match
Pattern ---> findall / search / match
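A quick sketch of greedy vs. lazy quantifiers, and of a compiled Pattern exposing the same findall/search/match methods (the sample string is made up for illustration):

import re

html = '<b>first</b><b>second</b>'
print(re.findall(r'<b>(.*)</b>', html))   # greedy: ['first</b><b>second']
print(re.findall(r'<b>(.*?)</b>', html))  # lazy:   ['first', 'second']
# A compiled Pattern object offers the same findall/search/match methods
pattern = re.compile(r'<b>(.*?)</b>')
print(pattern.search(html).group(1))      # first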
Parsing HTML with a regular expression to grab the titles of the Douban Top 250 movies:
import re
import requests

# Create a compiled regular expression object
title_pattern = re.compile(r'<span class="title">([^&;]*?)</span>')
for page in range(10):
    resp = requests.get(
        url='https://movie.douban.com/top250',
        params={
            'start': page * 25
        },
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
        },
    )
    print(resp.status_code)
    if resp.status_code == 200:
        titles_list = title_pattern.findall(resp.text)
        for index, title in enumerate(titles_list):
            print(index, title)
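The [^&;] character class in the pattern is doing the filtering here: each movie entry on the page also has a second <span class="title"> holding the "&nbsp;/&nbsp;original title" text, and because that text contains & and ; it cannot match, so only the Chinese titles are collected.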
2. Fetching details for the Douban Top 250 movies
import bs4
import openpyxl
import requests


def fetch_movie_detail(url):
    resp = requests.get(
        url=url,
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
        },
    )
    if resp.status_code == 200:
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        genre_spans = soup.select('#info > span[property="v:genre"]')
        genre = '/'.join([genre_span.text for genre_span in genre_spans])
        last_genre_span = genre_spans[-1]  # type: bs4.Tag
        country_span = last_genre_span.find_next_sibling('span')
        country = str(country_span.next_sibling).strip()
        language_span = country_span.find_next_sibling('span')
        language = str(language_span.next_sibling).strip()
        runtime = soup.select_one('#info > span[property="v:runtime"]').text
        return genre, country, language, runtime


def main():
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'Top250'
    sheet.append(('电影名字', '评分', '名句', '类型', '制片国家', '语言', '片长'))
    for page in range(10):
        resp = requests.get(
            url=f'https://movie.douban.com/top250?start={page * 25}',
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
            },
        )
        print(resp.status_code)
        if resp.status_code == 200:
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            div_list = soup.select('div.info')
            for div in div_list:  # type: bs4.Tag
                detail_url = div.select_one('div.hd > a').attrs['href']
                title = div.select_one('div.hd > a > span.title').text
                rating = div.select_one('div.bd > div > span.rating_num').text
                motto_span = div.select_one('div.bd > p.quote > span')
                motto = motto_span.text if motto_span else '--------'
                genre, country, language, runtime = fetch_movie_detail(detail_url)
                sheet.append((title, rating, motto, genre, country, language, runtime))
    workbook.save('豆瓣top250电影数据.xlsx')


if __name__ == '__main__':
    main()
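One caveat with this version: fetch_movie_detail only returns its four-tuple when the detail page answers with 200; on any other status it falls through and returns None, and the tuple unpacking in main() would then raise a TypeError. Guarding that call (or returning a default tuple) would make the crawl more robust.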
3. When multiple requests need the same request headers, use a Session object to access the URLs
import bs4
import openpyxl
import requests


def fetch_movie_detail(session, url):
    resp = session.get(url=url)
    if resp.status_code == 200:
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        genre_spans = soup.select('#info > span[property="v:genre"]')
        genre = '/'.join([genre_span.text for genre_span in genre_spans])
        last_genre_span = genre_spans[-1]  # type: bs4.Tag
        country_span = last_genre_span.find_next_sibling('span')
        country = str(country_span.next_sibling).strip()
        language_span = country_span.find_next_sibling('span')
        language = str(language_span.next_sibling).strip()
        runtime = soup.select_one('#info > span[property="v:runtime"]').text
        return genre, country, language, runtime


def initialize_session(session):
    """Initialize the Session object."""
    session.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
    }
    session.proxies = {}
    # session.cookies = ...  # placeholder from class; assign a CookieJar here if cookies are needed
    session.verify = False


def main():
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'Top250'
    sheet.append(('电影名字', '评分', '名句', '类型', '制片国家', '语言', '片长'))
    # Create a Session object (a persistent conversation with the server)
    session = requests.Session()
    initialize_session(session)
    for page in range(10):
        # Issue a GET request by calling get() on the Session object
        resp = session.get(url=f'https://movie.douban.com/top250?start={page * 25}')
        print(resp.status_code)
        if resp.status_code == 200:
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            div_list = soup.select('div.info')
            for div in div_list:  # type: bs4.Tag
                get_movie_info(session, div, sheet)
    workbook.save('豆瓣top250电影数据.xlsx')


def get_movie_info(session, div, sheet):
    detail_url = div.select_one('div.hd > a').attrs['href']
    title = div.select_one('div.hd > a > span.title').text
    rating = div.select_one('div.bd > div > span.rating_num').text
    motto_span = div.select_one('div.bd > p.quote > span')
    motto = motto_span.text if motto_span else '--------'
    genre, country, language, runtime = fetch_movie_detail(session, detail_url)
    sheet.append((title, rating, motto, genre, country, language, runtime))


if __name__ == '__main__':
    main()
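Compared with section 2, the headers are set once in initialize_session and every session.get call reuses them. A requests.Session also persists cookies between requests and reuses the underlying TCP connection to the same host, so repeated requests are slightly cheaper. Note that with verify set to False, urllib3 prints an InsecureRequestWarning for each request unless the warning is suppressed.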
4. Scraping Boss Zhipin job data (Selenium)
"""
selenium ---> 自动化测试工具 --->IDE/ webdriver / RemoteControl
爬取boss直聘上招数据分析的公司
"""
import bs4
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
browser = webdriver.Chrome()
browser.get('https://www.zhipin.com/')
browser.implicitly_wait(10)
# WebElement ---> send_keys() / click()
query_input = browser.find_element(By.CSS_SELECTOR, 'input[name="query"]')
query_input.send_keys('数据分析') # 模拟用户输入
# query_input.send_keys(Keys.ENTER)
query_button = browser.find_element(By.CSS_SELECTOR, 'button.btn-search')
query_button.click() # 模拟用户点击
# print(browser.page_source) # 拿页面源代码
# 可以使用正则表达式或BeautifulSoup4去解析页面
soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')
# [Tag] --->text/attrs['...']
company_anchors = soup.select('div.info-company > div > h3 > a')
# 也可以直接使用Chrome对象的find_element/find_elements查找页面元素
# [WebElement]--->text/get_attribute('...') 取标签的文本内容和标签属性
# company_anchors = browser.find_elements(By.CSS_SELECTOR, 'div.info-company > div > h3 > a')
for company_anchor in company_anchors: # type: WebElement
print(company_anchor.attrs['href'])
print(company_anchor.text)
# print(company_anchor.get_attribute('href'))
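Note that implicitly_wait(10) only affects find_element/find_elements, letting them poll the DOM for up to 10 seconds before giving up; it does not delay browser.page_source, so the parse right after the click can race against the page render. A minimal sketch of an explicit wait (reusing browser, By, and bs4 from the script above) that blocks until the company links exist before parsing:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for at least one company link to appear, then parse
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.info-company > div > h3 > a'))
)
soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')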
5. Homework (scraping data-analysis job postings from Boss Zhipin)
import bs4
import openpyxl
from selenium import webdriver
from selenium.webdriver.common.by import By

workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = '数据分析岗位'
sheet.append(('岗位', '公司', '薪资', '经验', '公司类型'))
browser = webdriver.Chrome()
browser.get('https://www.zhipin.com/')
browser.implicitly_wait(10)
query_input = browser.find_element(By.CSS_SELECTOR, 'input[name="query"]')
query_input.send_keys('数据分析')  # Simulate the user typing
query_button = browser.find_element(By.CSS_SELECTOR, 'button.btn-search')
query_button.click()  # Simulate a user click
for x in range(1, 9):
    browser.get(f'https://www.zhipin.com/c101270100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page={x}&ka=page-{x}')
    soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')
    job_name = soup.select('div.primary-wrapper > div > div.job-title > span.job-name > a')
    company_anchors = soup.select('div.info-company > div > h3 > a')
    job_salary = soup.select('div.primary-wrapper > div > div.job-limit.clearfix > span')
    job_ep = soup.select('div.primary-wrapper > div > div.job-limit.clearfix > p')
    company_genre = soup.select('div.info-company > div > p > a')
    for name, company_anchor, salary, ep, genre in zip(job_name, company_anchors, job_salary, job_ep, company_genre):
        sheet.append((name.text, company_anchor.text, salary.text, ep.text, genre.text))
workbook.save('成都数据分析岗位数据.xlsx')
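Two assumptions baked into the homework script: the five soup.select results are paired with zip, so the rows only line up if every selector returns exactly one element per job card and in the same page order; and the c101270100 segment in the URL appears to be the Chengdu city code, which matches the 成都 output filename. The workbook is written once, after all eight result pages have been scraped.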