学习Python的日子 爬虫(6)

抓取淘宝美食数据案例

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import re
from bs4 import BeautifulSoup

# Single Chrome session used by every function in this script; it is created as soon as the module is loaded.
driver = webdriver.Chrome()

# Explicit-wait helper bound to the shared driver; each `wait.until(...)` call is given a timeout of 5 (seconds).
wait = WebDriverWait(driver, 5)


def next_page(page):
    """Jump the search-result pager to ``page`` and scrape that page.

    Types the page number into the pager's jump box, clicks the confirm
    button, waits until the highlighted pager item shows ``page``, then hands
    over to ``get_product_info``.

    Args:
        page: Page number to switch to.

    Raises:
        selenium.common.exceptions.TimeoutException: If the jump box or the
            highlighted pager item does not show up within the wait timeout.
        selenium.common.exceptions.NoSuchElementException: If the confirm
            button is not present in the pager.
    """
    print("正在切换", page, "页")
    pager_form = "#mainsrp-pager div div div div.form"
    # Named `page_box` rather than `input` so the builtin is not shadowed.
    page_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, pager_form + " input")))
    page_box.clear()
    page_box.send_keys(str(page))
    # `find_element(By.<strategy>, ...)` works on Selenium 3 and 4; the
    # `find_element_by_*` shortcuts were removed in Selenium 4.
    driver.find_element(By.CSS_SELECTOR, pager_form + " span.btn.J_Submit").click()
    # The page switch is asynchronous: block until the active pager item
    # actually displays the requested number before parsing.
    wait.until(EC.text_to_be_present_in_element(
        (By.CSS_SELECTOR, "#mainsrp-pager div div div ul li.item.active"), str(page)))
    get_product_info(page)


def get_product_info(page):
    """Parse the result page currently loaded in the browser and print each product.

    Waits for at least one product card to be present, parses the page source
    with BeautifulSoup, and prints one dict per card with the keys
    ``location``, ``shopname``, ``title``, ``image`` and ``data_link``.

    Args:
        page: Page number, used only in the progress message.
    """
    print("正在解析", page, "页")
    card_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, card_selector)))
    soup = BeautifulSoup(driver.page_source, "lxml")
    for card in soup.select(card_selector):
        # Each field is taken from the first matching node inside the card.
        print({
            "location": card.select(".location")[0].text,
            "shopname": card.select(".shopname")[0].text,
            "title": card.select(".title .J_ClickStat")[0].text,
            "image": card.select("img")[0]["data-src"],
            "data_link": card.select(".pic-link.J_ClickStat.J_ItemPicA")[0]["href"],
        })


def get_total_pagel():
    """Search Taobao for "美食", scrape the first result page and return the page count.

    Returns:
        str: The first run of digits found in the pager's ``total`` label.
        The caller is expected to convert it with ``int()``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the search box or the
            ``total`` label does not appear within the wait timeout.
        ValueError: If the ``total`` label contains no digits.
    """
    driver.get("https://www.taobao.com/")
    # Named `search_box` rather than `input` so the builtin is not shadowed.
    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.send_keys("美食")
    driver.find_element(By.CSS_SELECTOR, ".btn-search").click()
    # The result page loads after the click, so wait for the label instead of
    # looking it up immediately (which races with the page load).
    total = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "total"))).text
    match = re.search(r'\d+', total)
    if match is None:
        raise ValueError("no page count found in pager label: %r" % total)
    total_num = match.group()
    get_product_info(1)
    return total_num


if __name__ == '__main__':
    # try/finally so the browser is closed even when scraping raises
    # (for example when one of the explicit waits times out).
    try:
        total_pages = int(get_total_pagel())

        # Page 1 is scraped inside get_total_pagel(); continue from page 2.
        for page in range(2, total_pages + 1):
            next_page(page)
    finally:
        driver.quit()

尝试对验证码进行机器识别处理--登录知乎案例

import time
from selenium import webdriver
import base64
from PIL import Image
from pytesseract import image_to_string


def get_image_text():
    """Return the captcha text for ``captcha.jpg``.

    Runs OCR on the saved captcha image first. If OCR fails or yields nothing
    usable, asks the user to type the captcha on the console instead.

    Returns:
        str: The OCR result with surrounding whitespace removed, or whatever
        the user typed at the prompt.
    """
    result_text = None
    try:
        # The context manager releases the image file handle once OCR is done.
        with Image.open("captcha.jpg") as image:
            result_text = image_to_string(image)
        print("reuslt_text:", result_text)
    except Exception as e:
        # Best-effort OCR: any failure (missing file, OCR engine unavailable,
        # ...) is reported and handled by the manual prompt below.
        print(e)

    # Strip surrounding whitespace so stray line breaks from OCR are not typed
    # into the form, and treat an empty result like a failed recognition.
    if result_text is not None:
        result_text = result_text.strip()
    if not result_text:
        result_text = input("请输入验证码:")

    return result_text


# Save the captcha image
def save_captcha(base64data):
    """Decode a base64 ``data:`` URL and write the image bytes to ``captcha.jpg``.

    Args:
        base64data: The ``src`` value of the captcha image, e.g.
            ``data:image/jpg;base64,<payload>``. A bare base64 string without
            the ``data:`` header is accepted as well.

    Raises:
        binascii.Error: If the payload is not valid base64.
    """
    # In a data URL everything after the first comma is the payload, so the
    # header is cut there instead of assuming one fixed media type.
    _, comma, payload = base64data.partition(",")
    if not comma:
        payload = base64data
    # Remove literal and URL-encoded line breaks; "%0A" must go because "0"
    # and "A" are themselves valid base64 characters and would corrupt the data.
    payload = payload.replace("\n", "").replace("%0A", "")
    print("image_data==", payload)
    image_data = base64.b64decode(payload)

    with open("captcha.jpg", "wb") as file:
        file.write(image_data)

    print("验证保存成功!")


def zhihulogin():
    """Log in to Zhihu in a fresh Chrome session, solving an English captcha via OCR.

    Flow: open the sign-up page, switch to the log-in form, fill in the
    account and password, handle a captcha if the page shows one, submit the
    form and save a screenshot. When a Chinese captcha image is shown it is
    only saved to ``captcha.jpg`` and the function stops without submitting.
    The browser is always closed before returning.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If an expected
            form element is missing from the page.
    """
    # Local import: the import block of this script does not bring `By` in.
    from selenium.webdriver.common.by import By

    # NOTE(review): presumably the `src` the page uses when no captcha image
    # is loaded; anything longer is treated as a real image.
    placeholder_len = len("data:image/jpg;base64,null")

    driver = webdriver.Chrome()
    try:
        # Open the sign-up page
        driver.get("https://www.zhihu.com/signup?next=%2F")
        time.sleep(2)
        # Switch from the sign-up form to the log-in form
        driver.find_element(By.XPATH, '//div[@class="SignContainer-switch"]/span').click()

        # Enter the account
        driver.find_element(By.NAME, "username").send_keys("账号")

        # Enter the password
        driver.find_element(By.NAME, "password").send_keys("密码")
        driver.save_screenshot("输入账号和密码完毕.png")

        page_source = driver.page_source
        if "Captcha-englishContainer" in page_source:
            src = driver.find_element(By.CLASS_NAME, "Captcha-englishImg").get_attribute("src")
            print(src)

            if src and len(src) > placeholder_len:
                # An English captcha is shown
                print("英文验证码出现")
                save_captcha(src)

                # Recognise the captcha and type it into the captcha box
                captcha_text = get_image_text()
                captcha_box = driver.find_element(By.XPATH, '//div[@class="Input-wrapper"]/input')
                captcha_box.send_keys(captcha_text)
            else:
                print("没有英文验证码")

        elif "Captcha-chineseContainer" in page_source:
            src = driver.find_element(By.CLASS_NAME, "Captcha-chineseImg").get_attribute("src")
            print(src)

            if src and len(src) > placeholder_len:
                # A Chinese captcha is shown; it is only saved, not solved
                print("出现中文验证码了")
                save_captcha(src)
                # Stop here: submitting without the captcha cannot succeed.
                # The `finally` clause closes the browser.
                return
            else:
                print("没有中文验证码")

        submit = driver.find_element(By.XPATH, '//div[@class="Login-content"]/form/button')

        # Click the log-in button
        submit.click()
        time.sleep(5)

        # Save a screenshot of the page after logging in
        driver.save_screenshot("登录成功.png")
    finally:
        # Close the browser on every path, including errors and early return.
        driver.quit()


# Run the login flow only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    zhihulogin()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值