October 25 Study Summary

I. Review

url='', uniform resource locator
params={}, query string parameters
headers={}, request headers
cookies, data the site asks the browser to store locally
proxies={}, proxy servers
auth=(), HTTP authentication (e.g. a (username, password) tuple for Basic Auth)
timeout=5, timeout in seconds
verify, SSL certificate verification
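
A minimal sketch that puts these parameters together in one requests.get() call (the URL, cookie value and credentials below are placeholders for illustration, not real endpoints):

import requests

resp = requests.get(
    url='https://httpbin.org/get',              # uniform resource locator (placeholder URL)
    params={'keyword': 'python'},               # query string parameters
    headers={'User-Agent': 'Mozilla/5.0'},      # request headers
    cookies={'sessionid': 'xyz'},               # cookies sent with the request (made-up value)
    proxies={},                                 # proxy mapping, e.g. {'https': 'http://ip:port'}; empty means no proxy
    auth=('user', 'secret'),                    # HTTP Basic authentication (made-up credentials)
    timeout=5,                                  # give up if no response arrives within 5 seconds
    verify=True,                                # verify the server's SSL certificate
)
print(resp.status_code)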

Response status codes:
2xx - success
3xx - redirection
4xx - client error (problem with the request)
  400 - Bad Request
  401 - Unauthorized
  403 - Forbidden
  404 - Not Found
  405 - Method Not Allowed
5xx - server error
  500 - Internal Server Error
  502 - Bad Gateway
  503 - Service Unavailable
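
A small sketch of how these codes are usually checked (placeholder URL; requests also provides raise_for_status() to turn 4xx/5xx responses into exceptions):

import requests

resp = requests.get('https://httpbin.org/get', timeout=5)   # placeholder URL
if resp.status_code == 200:
    print('success')                 # 2xx - success
elif 300 <= resp.status_code < 400:
    print('redirected')              # 3xx - redirection
elif 400 <= resp.status_code < 500:
    print('client-side problem')     # 4xx - e.g. 403 Forbidden, 404 Not Found
else:
    print('server-side problem')     # 5xx - e.g. 500 Internal Server Error, 502 Bad Gateway
# resp.raise_for_status() would raise an HTTPError for any 4xx/5xx response
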
re - regular expression

Quantifiers
{n}   -   exactly n times
{n,m} -   between n and m times
{n,}  -   at least n times
{,n}  -   at most n times
*   -   zero or more times (*? - lazy match, matches as little as possible)
+   -   one or more times (+? - lazy match, matches as little as possible)
?   -   zero or one time
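
A quick illustration of greedy vs. lazy matching (the sample string is made up for demonstration):

import re

text = '<b>one</b><b>two</b>'
print(re.findall(r'<b>(.*)</b>', text))    # greedy: matches as much as possible   -> ['one</b><b>two']
print(re.findall(r'<b>(.*?)</b>', text))   # lazy:   matches as little as possible -> ['one', 'two']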

The re module
    re.findall/re.search/re.match
    Pattern--->findall/search/match
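
A short sketch of how the three functions differ, and of reusing a pre-compiled Pattern object (the sample string is made up):

import re

text = 'id: 123, code: 456'
print(re.findall(r'\d+', text))          # every match               -> ['123', '456']
print(re.search(r'\d+', text).group())   # first match anywhere      -> '123'
print(re.match(r'\d+', text))            # must match from the start -> None here
pattern = re.compile(r'\d+')             # compile once, reuse many times
print(pattern.findall(text))             # -> ['123', '456']
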
Parsing HTML with a regular expression: grabbing the titles of Douban's Top 250 movies
import re

import requests

# compile the regular expression into a Pattern object
title_pattern = re.compile(r'<span class="title">([^&;]*?)</span>')

for page in range(10):
    resp = requests.get(
        url='https://movie.douban.com/top250',
        params={
            'start': page * 25
        },
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
        },
    )
    print(resp.status_code)
    if resp.status_code == 200:
        titles_list = title_pattern.findall(resp.text)
        for index, title in enumerate(titles_list):
            print(index, title)

II. Fetching details for Douban's Top 250 movies

import bs4
import openpyxl
import requests


def fetch_movie_detail(url):
    resp = requests.get(
        url=url,
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
        },
    )
    if resp.status_code == 200:
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        genre_spans = soup.select('#info > span[property="v:genre"]')
        genre = '/'.join([genre_span.text for genre_span in genre_spans])
        last_genre_span = genre_spans[-1]  # type:bs4.Tag
        country_span = last_genre_span.find_next_sibling('span')
        country = str(country_span.next_sibling).strip()
        language_span = country_span.find_next_sibling('span')
        language = str(language_span.next_sibling).strip()
        runtime = soup.select_one('#info > span[property="v:runtime"]').text
        return genre, country, language, runtime


def main():
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'Top250'
    sheet.append(('电影名字', '评分', '名句', '类型', '制片国家', '语言', '片长'))
    for page in range(10):
        resp = requests.get(
            url=f'https://movie.douban.com/top250?start={page * 25}',
            headers={
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
            },
        )
        print(resp.status_code)
        if resp.status_code == 200:
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            div_list = soup.select('div.info')
            for div in div_list:  # type:bs4.Tag
                detail_url = div.select_one('div.hd > a').attrs['href']
                title = div.select_one('div.hd > a > span.title').text
                rating = div.select_one('div.bd > div > span.rating_num').text
                motto_span = div.select_one('div.bd > p.quote > span')
                motto = motto_span.text if motto_span else '--------'
                genre, country, language, runtime = fetch_movie_detail(detail_url)
                sheet.append((title, rating, motto, genre, country, language, runtime))
    workbook.save('豆瓣top250电影数据.xlsx')


if __name__ == '__main__':
    main()

III. When many requests need the same headers: access URLs through a Session object

import bs4
import openpyxl
import requests


def fetch_movie_detail(session, url):
    resp = session.get(url=url)
    if resp.status_code == 200:
        soup = bs4.BeautifulSoup(resp.text, 'html.parser')
        genre_spans = soup.select('#info > span[property="v:genre"]')
        genre = '/'.join([genre_span.text for genre_span in genre_spans])
        last_genre_span = genre_spans[-1]  # type:bs4.Tag
        country_span = last_genre_span.find_next_sibling('span')
        country = str(country_span.next_sibling).strip()
        language_span = country_span.find_next_sibling('span')
        language = str(language_span.next_sibling).strip()
        runtime = soup.select_one('#info > span[property="v:runtime"]').text
        return genre, country, language, runtime


def initialize_session(session):
    """初始化Session对象"""
    session.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
    }
    session.proxies = {}    # no proxy by default; fill in a mapping here if one is needed
    # session.cookies keeps its default CookieJar; it could be pre-populated here if needed
    session.verify = False  # skip SSL certificate verification


def main():
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = 'Top250'
    sheet.append(('电影名字', '评分', '名句', '类型', '制片国家', '语言', '片长'))
    # create a Session object (one shared browsing session)
    session = requests.Session()
    initialize_session(session)
    for page in range(10):
        # make a GET request by calling get() on the Session object
        resp = session.get(url=f'https://movie.douban.com/top250?start={page * 25}')
        print(resp.status_code)
        if resp.status_code == 200:
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            div_list = soup.select('div.info')
            for div in div_list:  # type:bs4.Tag
                get_movie_info(session, div, sheet)
    workbook.save('豆瓣top250电影数据.xlsx')


def get_movie_info(session, div, sheet):
    detail_url = div.select_one('div.hd > a').attrs['href']
    title = div.select_one('div.hd > a > span.title').text
    rating = div.select_one('div.bd > div > span.rating_num').text
    motto_span = div.select_one('div.bd > p.quote > span')
    motto = motto_span.text if motto_span else '--------'
    genre, country, language, runtime = fetch_movie_detail(session, detail_url)
    sheet.append((title, rating, motto, genre, country, language, runtime))


if __name__ == '__main__':
    main()

IV. Scraping Boss Zhipin data (selenium)

"""
selenium ---> automated testing tool ---> IDE / WebDriver / Remote Control
Scrape the companies hiring for data-analysis roles on Boss Zhipin
"""
import bs4

from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement

browser = webdriver.Chrome()
browser.get('https://www.zhipin.com/')
browser.implicitly_wait(10)

# WebElement ---> send_keys() / click()
query_input = browser.find_element(By.CSS_SELECTOR, 'input[name="query"]')
query_input.send_keys('数据分析')  # simulate the user typing the search keyword

# query_input.send_keys(Keys.ENTER)
query_button = browser.find_element(By.CSS_SELECTOR, 'button.btn-search')
query_button.click()  # simulate a user click on the search button
# print(browser.page_source)  # grab the full page source
# the page source can then be parsed with regular expressions or BeautifulSoup4
soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')
# [Tag] --->text/attrs['...']
company_anchors = soup.select('div.info-company > div > h3 > a')
# the Chrome object's find_element/find_elements can also locate page elements directly
# [WebElement] ---> text / get_attribute('...')   to read a tag's text content and attributes
# company_anchors = browser.find_elements(By.CSS_SELECTOR, 'div.info-company > div > h3 > a')
for company_anchor in company_anchors:  # type: bs4.Tag
    print(company_anchor.attrs['href'])
    print(company_anchor.text)
    # print(company_anchor.get_attribute('href'))

V. Homework (scrape data-analysis job postings from Boss Zhipin)

import bs4
import openpyxl

from selenium import webdriver
from selenium.webdriver.common.by import By


workbook = openpyxl.Workbook()
sheet = workbook.active
sheet.title = '数据分析岗位'
sheet.append(('岗位', '公司', '薪资', '经验', '公司类型'))
browser = webdriver.Chrome()
browser.get('https://www.zhipin.com/')
browser.implicitly_wait(10)
query_input = browser.find_element(By.CSS_SELECTOR, 'input[name="query"]')
query_input.send_keys('数据分析')  # simulate the user typing the search keyword
query_button = browser.find_element(By.CSS_SELECTOR, 'button.btn-search')
query_button.click()  # simulate a user click on the search button
for x in range(1, 9):
    browser.get(f'https://www.zhipin.com/c101270100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90&page={x}&ka=page-{x}')
    soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')
    job_name = soup.select('div.primary-wrapper > div > div.job-title > span.job-name > a')
    company_anchors = soup.select('div.info-company > div > h3 > a')
    job_salary = soup.select('div.primary-wrapper > div > div.job-limit.clearfix > span')
    job_ep = soup.select('div.primary-wrapper > div > div.job-limit.clearfix > p')
    company_genre = soup.select('div.info-company > div > p > a')
    for name, company_anchor, salary, ep, genre in zip(job_name, company_anchors, job_salary, job_ep, company_genre):
        sheet.append((name.text, company_anchor.text, salary.text, ep.text, genre.text))
    workbook.save('成都数据分析岗位数据.xlsx')