Web scraping with bs4

# bs4

Usage notes

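In short, bs4 turns an HTML document into a searchable tree. A minimal sketch of the entry points the verification script below exercises (`find`, `find_all`, and CSS `select`), assuming `beautifulsoup4` and `lxml` are installed (`pip install beautifulsoup4 lxml`):

```python
from bs4 import BeautifulSoup

html = """
<div class="article-title"><h1>Demo title</h1></div>
<div class="content"><p>first paragraph</p><p>second paragraph</p></div>
"""
soup = BeautifulSoup(html, "lxml")

print(soup.find('div', class_='article-title').text)  # first matching tag only
print(len(soup.find_all('p')))                        # all matching tags, as a list
for p in soup.select('.content > p'):                 # CSS child selector
    print(p.text)
```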

Usage verification test

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    # Spoof the User-Agent so the request looks like a normal browser
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    }
    # URL of a sample article
    url = "http://www.yingyuyufa.com/cixing/mingci/1576.html"
    # Send the request; the site serves GB2312-encoded pages
    page_text = requests.get(url=url, headers=header)
    page_text.encoding = "gb2312"
    page_text = page_text.text
    # print(page_text)
    soup = BeautifulSoup(page_text, "lxml")
    print(soup.find('div', class_='article-title').text)
    p_list = soup.select('.content > p')
    for p in p_list:
        print(p.text)
    # print(soup.a)  # soup.tagName returns the first occurrence of that tag in the HTML
    # print(soup.div)
    # print(soup.find('div'))  # equivalent to print(soup.div)
    # print(soup.find('div', class_='content'))  # filter by attribute
    # print(soup.find_all('p'))  # returns all matching tags, as a list
    # print(soup.select('.content'))  # CSS selectors: class, id, or tag
    # print(soup.select('.content > p'))  # child (hierarchical) selector; returns a list
    # Compare the three text accessors on the first two paragraphs
    print(soup.select('.content > p')[0].text)
    print(soup.select('.content > p')[0].string)
    print(soup.select('.content > p')[0].get_text())
    print(soup.select('.content > p')[1].text)
    print(soup.select('.content > p')[1].string)
    print(soup.select('.content > p')[1].get_text())
```
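The last six prints contrast the three text accessors. `.text` and `.get_text()` always concatenate every descendant string, while `.string` is only non-None when the tag has exactly one child node. A self-contained illustration:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>plain</p><p>with <b>nested</b> tag</p>", "lxml")
first, second = soup.find_all("p")

print(first.text, first.string, first.get_text())  # all three: "plain"
print(second.text)        # "with nested tag"
print(second.string)      # None -- the tag has more than one child node
print(second.get_text())  # "with nested tag"
```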

0919_英语语法网.py

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from docx import Document

if __name__ == "__main__":
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    }
    # URL of the site's front page
    url = "http://www.yingyuyufa.com/"
    # Send the request; the site serves GB2312-encoded pages
    page_text = requests.get(url=url, headers=header)
    page_text.encoding = "gb2312"
    page_text = page_text.text
    soup = BeautifulSoup(page_text, "lxml")
    list_subnav_li = soup.select('#subnav > ul > li')
    list_subnav = []
    for li in list_subnav_li:
        dict_subnav = {}
        title_subnav = li.text
        url_subnav = li.a["href"]
        dict_subnav["title_subnav"] = title_subnav
        dict_subnav["url_subnav"] = url_subnav
        list_subnav.append(dict_subnav)
    # list_subnav now holds every category from the div (id=subnav): the parts of
    # speech (noun, verb, adjective, pronoun, article, numeral, preposition,
    # conjunction), non-finite/modal/linking verbs, sentence types (questions,
    # imperatives, exclamations, negation, inversion, emphasis, existential,
    # ellipsis), sentence elements, the clause types (adverbial, attributive,
    # noun clauses), all the tenses, active/passive voice, the subjunctive,
    # comparison, absolute constructions, subject-verb agreement, word usage,
    # and primary/middle/high-school/college English grammar.
    for nav in list_subnav:
        url_subnav = nav["url_subnav"]
        title_subnav = nav["title_subnav"]
        page_text = requests.get(url=url_subnav, headers=header)
        page_text.encoding = "gb2312"
        page_text = page_text.text
        soup = BeautifulSoup(page_text, "lxml")
        list_url_title = []
        li_list_article = soup.select('.list-article > ul > li')
        # Articles on the category's first page
        for li in li_list_article:
            dict_li = {}
            title_name = li.find_all("a")[1].text
            url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
            dict_li['title_name'] = title_name
            dict_li['url_name'] = url_name
            list_url_title.append(dict_li)
        # print(list_url_title)
        # Follow the numbered pager links, skipping "下一页" (next) and "末页" (last)
        li_list_pages = soup.select('.pages > ul > li')
        for li in li_list_pages:
            if li.a and li.a.text != "下一页" and li.a.text != "末页":
                url_pages = url_subnav + li.a.get("href")
                # print(url_pages)
                page_text = requests.get(url=url_pages, headers=header)
                page_text.encoding = "gb2312"
                page_text = page_text.text
                soup = BeautifulSoup(page_text, "lxml")
                li_list_article = soup.select('.list-article > ul > li')
                for li in li_list_article:
                    dict_li = {}
                    title_name = li.find_all("a")[1].text
                    url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
                    dict_li['title_name'] = title_name
                    dict_li['url_name'] = url_name
                    list_url_title.append(dict_li)
        print(title_subnav)
        print(url_subnav)
        # print(list_url_title)
        # One Word document per category
        document = Document()
        document.add_heading(title_subnav, level=1)
        document.add_heading(url_subnav, level=1)
        for article in list_url_title:
            url = article["url_name"]
            # Fetch each article page
            page_text = requests.get(url=url, headers=header)
            page_text.encoding = "gb2312"
            page_text = page_text.text
            # print(page_text)
            soup = BeautifulSoup(page_text, "lxml")
            # article_title = soup.find('div', class_='article-title').text
            article_title = soup.select('.article-title')
            article_title = article_title[0].h1.text  # take only the <h1>, dropping the date/author line
            print(article_title)
            document.add_heading(article_title, level=2)
            document.add_paragraph("URL: " + url)
            paragraphs = soup.select('.content > p')
            for p in paragraphs:
                document.add_paragraph(p.text)
            document.add_paragraph('\n')
        document.save(title_subnav + '.docx')  # python-docx writes the OOXML .docx format
```
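The fetch/decode/parse sequence and the per-article Word output repeat verbatim in this and the following scripts. One possible refactor, sketched with hypothetical helper names (`fetch_soup`, `add_article`) that are not part of the original code:

```python
import requests
from bs4 import BeautifulSoup
from docx import Document

def fetch_soup(url, header):
    """GET a page, apply the site's GB2312 encoding, return the parsed tree."""
    resp = requests.get(url=url, headers=header)
    resp.encoding = "gb2312"
    return BeautifulSoup(resp.text, "lxml")

def add_article(document, url, header, heading_level=2):
    """Append one article (title, URL, body paragraphs) to an open Document."""
    soup = fetch_soup(url, header)
    title = soup.select('.article-title')[0].h1.text  # drop the date/author line
    document.add_heading(title, level=heading_level)
    document.add_paragraph("URL: " + url)
    for p in soup.select('.content > p'):
        document.add_paragraph(p.text)
    document.add_paragraph('\n')
```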

0919_yingyuyufa_all_docx_单词用法.py

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from docx import Document

if __name__ == "__main__":
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    }
    # URL of the word-usage (单词用法) section
    url = "http://www.yingyuyufa.com/yongfa/"
    page_text = requests.get(url=url, headers=header)
    page_text.encoding = "gb2312"
    page_text = page_text.text
    soup = BeautifulSoup(page_text, "lxml")
    list_url_title = []
    # Articles on the section's first page
    li_list_article = soup.select('.list-article > ul > li')
    for li in li_list_article:
        dict_li = {}
        title_name = li.find_all("a")[1].text
        url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
        dict_li['title_name'] = title_name
        dict_li['url_name'] = url_name
        list_url_title.append(dict_li)
    # Pages 2..24 follow the pattern list_42_<n>.html
    url_list = []
    for i in range(2, 25):
        url = "http://www.yingyuyufa.com/yongfa/list_42_" + str(i) + ".html"
        url_list.append(url)
    for url_pages in url_list:
        page_text = requests.get(url=url_pages, headers=header)
        page_text.encoding = "gb2312"
        page_text = page_text.text
        soup = BeautifulSoup(page_text, "lxml")
        li_list_article = soup.select('.list-article > ul > li')
        for li in li_list_article:
            dict_li = {}
            title_name = li.find_all("a")[1].text
            url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
            dict_li['title_name'] = title_name
            dict_li['url_name'] = url_name
            list_url_title.append(dict_li)
    print(list_url_title)
    document = Document()
    for article in list_url_title:
        url = article["url_name"]
        # Fetch each article page
        page_text = requests.get(url=url, headers=header)
        page_text.encoding = "gb2312"
        page_text = page_text.text
        # print(page_text)
        soup = BeautifulSoup(page_text, "lxml")
        article_title = soup.select('.article-title')
        article_title = article_title[0].h1.text  # take only the <h1>, dropping the date/author line
        document.add_heading(article_title, level=1)
        document.add_paragraph("URL: " + url)
        paragraphs = soup.select('.content > p')
        for p in paragraphs:
            document.add_paragraph(p.text)
        document.add_paragraph('\n')
    document.save('单词用法_.docx')  # python-docx writes the OOXML .docx format
```
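Hard-coding `gb2312` works for this site but breaks on any page served with a different charset. `requests` can re-detect the encoding from the response bytes; a sketch:

```python
import requests

resp = requests.get("http://www.yingyuyufa.com/yongfa/",
                    headers={"user-agent": "Mozilla/5.0"})
# .encoding comes from the HTTP headers; .apparent_encoding is re-detected
# from the body bytes, which is more robust for GB2312/GBK pages
resp.encoding = resp.apparent_encoding
page_text = resp.text
```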

0919_yingyuyufa_all_docx_数词.py

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from docx import Document

if __name__ == "__main__":
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    }
    # URL of the numeral (数词) category; swap in the second line for articles (冠词)
    url = "http://www.yingyuyufa.com/cixing/shuci/"
    # url = "http://www.yingyuyufa.com/cixing/guanci/"
    # Send the request; the site serves GB2312-encoded pages
    page_text = requests.get(url=url, headers=header)
    page_text.encoding = "gb2312"
    page_text = page_text.text
    soup = BeautifulSoup(page_text, "lxml")
    list_url_title = []
    # Articles on the category's first page
    li_list_article = soup.select('.list-article > ul > li')
    for li in li_list_article:
        dict_li = {}
        title_name = li.find_all("a")[1].text
        url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
        dict_li['title_name'] = title_name
        dict_li['url_name'] = url_name
        list_url_title.append(dict_li)
    # print(list_url_title)

    # Follow the numbered pager links, skipping "下一页" (next) and "末页" (last)
    li_list_pages = soup.select('.pages > ul > li')
    for li in li_list_pages:
        if li.a and li.a.text != "下一页" and li.a.text != "末页":
            url_pages = url + li.a.get("href")
            # print(url_pages)
            page_text = requests.get(url=url_pages, headers=header)
            page_text.encoding = "gb2312"
            page_text = page_text.text
            soup = BeautifulSoup(page_text, "lxml")
            li_list_article = soup.select('.list-article > ul > li')
            for li in li_list_article:
                dict_li = {}
                title_name = li.find_all("a")[1].text
                url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
                dict_li['title_name'] = title_name
                dict_li['url_name'] = url_name
                list_url_title.append(dict_li)
    # print(list_url_title)
    document = Document()
    for article in list_url_title:
        url = article["url_name"]
        # print(url)
        # Fetch each article page
        page_text = requests.get(url=url, headers=header)
        page_text.encoding = "gb2312"
        page_text = page_text.text
        # print(page_text)
        soup = BeautifulSoup(page_text, "lxml")
        # article_title = soup.find('div', class_='article-title').text
        article_title = soup.select('.article-title')
        article_title = article_title[0].h1.text  # take only the <h1>, dropping the date/author line
        print(article_title)
        document.add_heading(article_title, level=1)
        document.add_paragraph("URL: " + url)
        paragraphs = soup.select('.content > p')
        for p in paragraphs:
            document.add_paragraph(p.text)
        document.add_paragraph('\n')
    document.save('主页_词性_数词.docx')  # python-docx writes the OOXML .docx format
```
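These scripts fire one request per article with no pause, and the pager can hand back links to articles already collected from the first page. A small courtesy wrapper, a sketch only (the name `polite_iter` and the half-second delay are arbitrary choices, not from the original code):

```python
import time

def polite_iter(articles, delay=0.5):
    """Yield each article dict once, sleeping between iterations."""
    seen = set()
    for article in articles:
        if article["url_name"] in seen:
            continue
        seen.add(article["url_name"])
        time.sleep(delay)  # be gentle with the server
        yield article

# usage: for article in polite_iter(list_url_title): ...
```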

0919_yingyuyufa_all_docx_动词.py

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from docx import Document

if __name__ == "__main__":
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
    }
    # URL of the verb (动词) category
    url = "http://www.yingyuyufa.com/cixing/dongci/"
    # Send the request; the site serves GB2312-encoded pages
    page_text = requests.get(url=url, headers=header)
    page_text.encoding = "gb2312"
    page_text = page_text.text
    soup = BeautifulSoup(page_text, "lxml")
    list_url_title = []
    # Articles on the category's first page
    li_list_article = soup.select('.list-article > ul > li')
    for li in li_list_article:
        dict_li = {}
        title_name = li.find_all("a")[1].text
        url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
        dict_li['title_name'] = title_name
        dict_li['url_name'] = url_name
        list_url_title.append(dict_li)
    # Remaining pager pages (list_6_2 .. list_6_12)
    url_list = ['http://www.yingyuyufa.com/cixing/dongci/list_6_2.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_3.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_4.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_5.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_6.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_7.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_8.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_9.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_10.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_11.html',
                'http://www.yingyuyufa.com/cixing/dongci/list_6_12.html']
    for url_pages in url_list:
        page_text = requests.get(url=url_pages, headers=header)
        page_text.encoding = "gb2312"
        page_text = page_text.text
        soup = BeautifulSoup(page_text, "lxml")
        li_list_article = soup.select('.list-article > ul > li')
        for li in li_list_article:
            dict_li = {}
            title_name = li.find_all("a")[1].text
            url_name = "http://www.yingyuyufa.com" + li.find_all("a")[1].get("href")
            dict_li['title_name'] = title_name
            dict_li['url_name'] = url_name
            list_url_title.append(dict_li)
    # print(list_url_title)
    document = Document()
    for article in list_url_title:
        url = article["url_name"]
        # Fetch each article page
        page_text = requests.get(url=url, headers=header)
        page_text.encoding = "gb2312"
        page_text = page_text.text
        # print(page_text)
        soup = BeautifulSoup(page_text, "lxml")
        # article_title = soup.find('div', class_='article-title').text
        article_title = soup.select('.article-title')
        article_title = article_title[0].h1.text  # take only the <h1>, dropping the date/author line
        document.add_heading(article_title, level=1)
        document.add_paragraph("URL: " + url)
        paragraphs = soup.select('.content > p')
        for p in paragraphs:
            document.add_paragraph(p.text)
        document.add_paragraph('\n')
    document.save('主页_词性_动词.docx')  # python-docx writes the OOXML .docx format
```
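The eleven pager URLs all follow the `list_6_<n>.html` pattern, so the hard-coded list is equivalent to the loop the 单词用法 script uses; a one-line sketch:

```python
url_list = ["http://www.yingyuyufa.com/cixing/dongci/list_6_%d.html" % i
            for i in range(2, 13)]
```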
