Web Scraping, Part 2

Basic usage of bs4

from bs4 import BeautifulSoup

from bs4.element import Tag

# Practice often: the more you use Python hands-on, the more this knowledge becomes your own
data = '''<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>soup测试</title>
    <title class="warm" id="hello" href="http://www.google.com/">你那温情的一笑,搞得我瑟瑟发抖</title>
</head>
<body>
<div class="tang">
    <ul>
        <li class="hello" id="world"><a id="world" href="http://www.baidu.com" title="出塞"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></li>
        <list><a href="https://www.baidu.com" title="出塞" style="font-weight: bold"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></list>
        <li><ul href="http://www.163.com" class="taohua" title="huahua">人面不知何处去,桃花依旧笑春风</ul></li>
        <lists class="hello"><a href="http://mi.com" id="hong" title="huahua">去年今日此门中,人面桃花相映红</a></lists>
        <li id="wo"><div href="http://qq.com" name="he" id="gu">故人西辞黄鹤楼,烟花三月下扬州</div></li>
    </ul>
    <ul>
        <li class="hello" id="sf"><a href="http://www.baidu.com" title="出塞"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></li>
        <list><a href="https://www.baidu.com" title="出塞"><!--秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山--></a></list>
        <li><a href="http://www.163.com" class="taohua">人面不知何处去,桃花依旧笑春风</a></li>
        <lists class="hello"><a href="http://mi.com" id="fhsf">去年今日此门中,人面桃花相映红,不知桃花何处去,出门依旧笑楚风</a></lists>
        <li id="fs"><a href="http://qq.com" name="he" id="gufds">故人西辞黄鹤楼,烟花三月下扬州</a></li>
    </ul>
</div>
<div id="meng">
    <p class="jiang">
        <span>三国猛将</span>
    <ol>
        <pl>关羽</pl>
        <li>张飞</li>
        <li>赵云</li>
        <zl>马超</zl>
        <li>黄忠</li>
    </ol>
    <div class="cao" id="h2">
        <ul>
            <li>典韦</li>
            <li>许褚</li>
            <li>张辽</li>
            <li>张郃</li>
            <li>于禁</li>
            <li>夏侯惇</li>
        </ul>
    </div>
    </p>
</div>
</body>
</html>'''

if __name__ == '__main__':
    # The second argument selects the parser; lxml is the same library that powers XPath
    # soup as a whole is a Python object: bs4.BeautifulSoup
    soup = BeautifulSoup(data, 'lxml')
    # Inspect the structure of soup
    # Tag: bs4.element.Tag; attribute-style access returns the first matching tag
    t = soup.title
    print(t)
    # print(type(t))
    # bs4.element.NavigableString
    s = soup.title.string
    print(s)
    # print(type(s))
    # bs4.element.Comment
    s = soup.li.string
    print(s)
    # print(type(s))

    # Child nodes (direct children only)
    # c = soup.body.div.ul.children
    # print(c)
    # for item in c:
    #     print(item)
    ul = soup.body.div.ul
    # Direct children of the ul (.children does not descend into grandchildren)
    d = ul.children
    for item in d:
        print(item.string)
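
    # To reach grandchildren as well, .descendants walks the whole subtree;
    # a quick sketch contrasting it with .children above:
    for node in ul.descendants:
        if node.name is not None:  # skip bare strings, keep only tags
            print(node.name)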

    # Searching the document tree
    # print(soup.find(name='li', id='wo'))
    # print(soup.find(name='li', attrs={'class': 'hello', 'id': 'world'}))
    # lis = soup.find_all('li', class_='hello')
    # print(lis)

    # CSS selector syntax: soup.select()
    # Select by tag name
    # ret = soup.select('li#wo')
    # ret = soup.select('lists.hello a')
    # print(ret)
    # Select by class name
    # print(soup.select('.hello'))
    # Select by id with #
    # print(soup.select('#sf'))
    # Combined selectors
    # print(soup.select('li#world'))
    # A space is the descendant combinator (matches nested tags at any depth)
    # print(soup.select('li #world'))
    # print(soup.select('li a'))
    # Note: the comma is a union, so this matches div > ol > pl plus every zl
    # print(soup.select('div > ol > pl,zl'))
    # Select by attribute
    # print(soup.select('div[class="cao"][id="h2"] > ul > li'))
    # print(soup.select('div[class="cao"][id="h2"] > ul > li')[1])

    # tag = soup.find_all(name='title')[1]
    # print(tag)
    # print(tag.name)
    # print(tag.attrs)
    # print(tag.attrs['href'])
    # print(tag.string)
    # print(tag['href'])
    # print(tag.get_text())
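
    # A recap sketch: combine find_all with attribute access to collect every
    # link in the document. href=True matches any tag that has an href at all.
    links = []
    for a in soup.find_all('a', href=True):
        # get_text() flattens children; an <a> holding only a comment yields ''
        links.append((a.get_text(strip=True), a['href']))
    print(links)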

Basic usage of jsonpath

import jsonpath
import json

data = '''{ "store": {
    "book": [ 
      { "category": "reference",
        "author": "李白",
        "title": "Sayings of the Century",
        "price": 8.95
      },
      { "category": "fiction",
        "author": "杜甫",
        "title": "Sword of Honour",
        "price": 12.99
      },
      { "category": "fiction",
        "author": "白居易",
        "title": "Moby Dick",
        "isbn": "0-553-21311-3",
        "price": 8.99
      },
      { "category": "fiction",
        "author": "苏轼",
        "title": "The Lord of the Rings",
        "isbn": "0-395-19395-8",
        "price": 22.99
      }
    ],
    "bicycle": {
      "color": "red",
      "price": 19.95
    }
  }
}'''
json_obj = json.loads(data)  # json.loads no longer takes an encoding argument

# print(jsonpath.jsonpath(json_obj, '$.store.book[*].author'))
# print(jsonpath.jsonpath(json_obj, '$..author'))
# print(jsonpath.jsonpath(json_obj, '$.store.book[?(@.price>12)]'))
# jsonpath indices start at 0
# print(jsonpath.jsonpath(json_obj, '$.store.book[0]'))
# @ is the current object, so length - 1 selects the last book
# print(jsonpath.jsonpath(json_obj, '$.store.book[(@.length-1)]'))
print(jsonpath.jsonpath(json_obj, '$.store.book[?(@.isbn)]'))
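
One quirk worth guarding against: jsonpath.jsonpath() returns False rather than an empty list when nothing matches. A minimal wrapper sketch, reusing json_obj from above:

def jp(obj, expr):
    # Normalize jsonpath's False-on-no-match into an empty list
    result = jsonpath.jsonpath(obj, expr)
    return result if result else []

print(jp(json_obj, '$..book[:2]'))  # slice: the first two books
print(jp(json_obj, '$.store.book[?(@.price>100)]'))  # no match -> []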

Basic usage of selenium
Imports:

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

url = 'http://www.qq.com/'
if __name__ == '__main__':
    driver = webdriver.Chrome()  # launch the browser
    driver.get(url)  # load the url
    time.sleep(2)
    # driver.maximize_window()  # maximize the window
    driver.set_window_size(1024, 768)  # set the window size; 1024x768 is just an example
    time.sleep(2)
    driver.save_screenshot('./qq.png')  # screenshot the page, saved as qq.png
    driver.close()

    # The two lines below assume the Baidu homepage (s_ipt is its search box,
    # su its search button) and must run before driver.close()
    driver.find_element_by_class_name('s_ipt').send_keys('macbook pro')  # type into the search box
    driver.find_element(By.ID, 'su').click()  # .click() presses the button

Fetching job data from Zhilian (zhaopin.com)

# url should point at the Zhilian page you want to scrape
driver = webdriver.Chrome()
driver.implicitly_wait(30)
driver.get(url)
time.sleep(5)
# Dismiss the risk-warning pop-up before reading the page
driver.find_element_by_xpath('//div[@class="risk-warning__content"]/button').click()
with open('./zhilian.html', 'w', encoding='utf-8') as fp:
    fp.write(driver.page_source)  # page_source returns the page's HTML source
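
The snippet above mixes implicitly_wait with a fixed sleep. A sketch of the explicit-wait alternative (reusing the same pop-up XPath), which polls until the element is actually clickable instead of sleeping a fixed time:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait up to 10 seconds for the pop-up button to become clickable, then click it
button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//div[@class="risk-warning__content"]/button'))
)
button.click()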


driver.back()  # go back one page in the browser history
driver.forward()  # go forward one page

When the browser has two pages open (e.g. a second tab or window), you need window handles:

windows = driver.window_handles
print(windows)
# Switch to the second page; the code below then operates on that page
driver.switch_to.window(windows[1])
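
A minimal end-to-end sketch of the same idea (the opened URL is just an example): open a second tab, work in it, then return to the original:

original = driver.current_window_handle
driver.execute_script("window.open('http://www.baidu.com');")  # open a new tab
for handle in driver.window_handles:
    if handle != original:
        driver.switch_to.window(handle)  # move into the new tab
        break
print(driver.title)  # now reflects the new tab
driver.close()  # closes only the current (new) tab
driver.switch_to.window(original)  # back to the first tab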

selenium – executing JavaScript

# Any JS statement can go inside execute_script; this one assumes the page
# loads jQuery, and img (a previously located element) becomes arguments[0]
driver.execute_script('$(arguments[0]).fadeOut()', img)

driver.execute_script("var q = document.getElementById(\"kw\");"
                          "q.style.border=\"2px solid red\";")
# Some login pages offer both phone-number and account/password login; the
# account/password form often lives inside an iframe, which behaves like a
# separate window: you must switch into it before locating its elements
iframe = driver.find_element_by_xpath('//div[@class="login"]/iframe')
print('------------', iframe)
driver.switch_to.frame(iframe)
account = driver.find_element_by_class_name('account-tab-account')
print('+++++++++++', account)
account.click()
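
When finished inside the iframe, switch back to the top-level document before locating anything outside it:

driver.switch_to.default_content()  # leave the iframe, back to the main page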

Using the tesseract library
It recognizes the text content of images (OCR):

from PIL import Image

import pytesseract

if __name__ == '__main__':
    image = Image.open('./chi.png')
    # result = pytesseract.image_to_data(image)
    result = pytesseract.image_to_string(image)
    print(result)
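
    # If the image contains Chinese text (the name chi.png suggests it), point
    # tesseract at the simplified-Chinese model instead; this assumes the
    # chi_sim language pack is installed alongside tesseract
    result = pytesseract.image_to_string(image, lang='chi_sim')
    print(result)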

# Automated login: fill in the account details and recognize the captcha

from selenium import webdriver
import pytesseract
from PIL import Image
import time

url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'

def get_captcha():
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.implicitly_wait(10)
    driver.get(url)
    # Locate the captcha image element and screenshot just that element
    img = driver.find_element_by_id('imgCode')
    img.screenshot('./captcha.png')
    # Alternative: screenshot the whole page, then crop the captcha out of it
    # driver.save_screenshot('./poem.png')
    # image = Image.open('./poem.png')
    # # top-left coordinates of the element
    # loc = img.location
    # # width and height of the element
    # size = img.size
    # # crop rectangle (left, upper, right, lower)
    # # 160,260
    # # 234,291
    # rec = (loc['x'] + 32, loc['y'] + 54, 234, 291)
    # captcha = image.crop(rec)
    # # save the crop to a file
    # captcha.save('./captcha.png')
    return driver


def recognize_captcha():
    captcha = Image.open('./captcha.png')
    gray = captcha.convert('L')
    data = gray.load()
    w,h = captcha.size
    for x in range(w):
        for y in range(h):
            # Grayscale values run 0-255: 0 is pure black, 255 pure white
            if data[x,y] < 140:
                data[x,y] = 0
            else:
                data[x,y] = 255
    code = pytesseract.image_to_string(gray)
    return code
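

def recognize_captcha_point():
    # Same idea as recognize_captcha above, but a sketch using PIL's point()
    # to apply the threshold in one call instead of a nested pixel loop
    captcha = Image.open('./captcha.png')
    gray = captcha.convert('L')
    binary = gray.point(lambda p: 0 if p < 140 else 255)
    return pytesseract.image_to_string(binary)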


def login(drive, code):
    drive.find_element_by_id('email').send_keys('455098435@qq.com')  # account
    drive.find_element_by_id('pwd').send_keys('31415926abc')  # password
    drive.find_element_by_id('code').send_keys(code)  # recognized captcha text
    time.sleep(1)

    drive.find_element_by_id('denglu').click()  # 'denglu' is the login button

if __name__ == '__main__':
    drive = get_captcha()
    # The recognized captcha text; OCR can misread it, so the login may fail
    code = recognize_captcha()
    print('----------------', code)
    login(drive, code)