python之：BeautifulSoup

最新推荐文章于 2024-07-08 16:42:42 发布

Surpass-HC

最新推荐文章于 2024-07-08 16:42:42 发布

阅读量242

点赞数

本文链接：https://blog.csdn.net/hxchuan000/article/details/111225850

版权

1.个人没玩好lxml，这里就用BeautifulSoup

2.用一段程序来开始：

import pandas as pd
import lxml.html
from lxml import etree
import re
import time
from urllib.request import urlopen, Request
import sys
import datetime
from bs4 import BeautifulSoup

#BeautifulSoup参考：https://cuiqingcai.com/1319.html

def printdbg(pstr = ""):
    """Print ``pstr`` prefixed with a timestamp and the caller's location.

    The emitted line has the form ``[timestamp][file:function:line]pstr``.
    The timestamp is the local time with microsecond resolution, and the
    location fields are taken from the frame that called this function.

    Args:
        pstr: Message appended after the two bracketed prefixes.
    """
    # Frame of whoever invoked printdbg (one level above this function).
    caller = sys._getframe().f_back
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    location = (caller.f_code.co_filename, caller.f_code.co_name, caller.f_lineno)
    print("[%s][%s:%s:%s]%s" % ((stamp,) + location + (pstr,)))
def getdianying():
    """Fetch the ygdy8.com movie index page and print details of each entry.

    The page is downloaded with a 10 second timeout and parsed with
    BeautifulSoup's ``html.parser``. For every ``<table class="tbspan">``
    the function prints, in order: the text of the first ``<a>``, its
    ``href`` attribute, the text of the first ``<font>``, the text of the
    first ``<td colspan="2" style="padding-left:3px">``, and a separator.

    Raises:
        IndexError: if a matched table lacks one of the expected elements.
        urllib.error.URLError: if the page cannot be fetched.
    """
    request = Request("https://www.ygdy8.com/html/gndy/dyzz/index.html")
    # Read inside a ``with`` block so the HTTP response (and its socket) is
    # closed deterministically instead of being left to garbage collection.
    with urlopen(request, timeout=10) as response:
        text = response.read()
    print("===========================================")
    soup = BeautifulSoup(text, 'html.parser')  # other parsers: 'lxml', 'xml'
    print(type(soup))
    tables = soup.find_all('table', attrs = {'class':'tbspan'})
    # Several attributes can be matched at once, e.g.
    #   soup.find_all('a', attrs={'id': 'test', 'class': 'test'})
    # and the class filter can also be written as
    #   soup.find_all('table', class_='tbspan')
    for node in tables:
        # Look the first anchor up once; ``node.a`` is an equivalent shortcut.
        link = node.find_all('a')[0]
        print(link.string)  # text content of the <a> tag
        # ``link['href']`` also works; ``.get`` yields None if the attribute is absent.
        print(link.get('href'))  # value of the <a> tag's href attribute
        print(node.find_all('font')[0].string)  # text content of the <font> tag
        print(node.find_all('td', attrs = {'colspan':'2', 'style':'padding-left:3px'})[0].string)
        # ``link.attrs`` would give all attributes of the tag as a dict.
        print("--------------------------------------------------------------->")
def getreport():
    """Fetch a report page from s.askci.com and print its table as a DataFrame.

    The page is downloaded with a 10 second timeout and parsed with
    BeautifulSoup's ``html.parser``. The ``<table>`` with class
    ``fancyTable`` and id ``myTable04`` is converted to a pandas DataFrame
    via ``pd.read_html`` and printed.

    Raises:
        IndexError: if the expected table is not present in the page.
        urllib.error.URLError: if the page cannot be fetched.
    """
    # Function-scope import so this block does not depend on a file-level change.
    from io import StringIO

    request = Request("https://s.askci.com/stock/a/0-0?reportTime=2020-09-30&pageNum=1#QueryCondition")
    # Read inside a ``with`` block so the HTTP response is always closed.
    with urlopen(request, timeout=10) as response:
        text = response.read()
    soup = BeautifulSoup(text, 'html.parser')  # other parsers: 'lxml', 'xml'
    tables = soup.find_all('table', attrs = {'class':'fancyTable', 'id':'myTable04'})
    if not tables:
        # Same exception type as indexing an empty result, but with a usable message.
        raise IndexError("table 'myTable04' (class 'fancyTable') not found in the response")
    # pandas deprecates passing literal HTML strings to read_html; wrapping the
    # markup in a file-like object works on both old and new versions.
    df = pd.read_html(StringIO(str(tables[0])))[0]
    # If the page contained only one table, the whole response could be parsed
    # directly instead, e.g. pd.read_html(io.BytesIO(text))[0].
    print(df)
# Run both demos in order: the movie listing first, then the report table.
for _demo in (getdianying, getreport):
    _demo()

使用Request请求网页，BeautifulSoup使用html.parser来解析，find_all查找属性class='tbspan'的table。

打印结果：

2020年剧情传记《曼克》BD中英双字幕
/html/gndy/dyzz/20201213/60839.html
日期：2020-12-13 02:10:24 
点击：0 
◎译 名 曼克/曼凯 ◎片 名 Mank ◎年 代 2020 ◎产 地 美国 ◎类 别 剧情 / 传记 ◎语 言 英语,德语,拉丁语 ◎字 幕 中英双字幕 ◎上映日期 2020-11-13(美国) / 2020-12-04(美国网络) ◎IMDb评分7.3/10 from 17237 users ◎豆瓣评分 7.6/10 from 3749 users ◎文件格
--------------------------------------------------------------->
2020年爱情悬疑《幻爱》BD粤语中字
/html/gndy/dyzz/20201212/60832.html
日期：2020-12-11 00:21:34 
点击：0 
◎译 名 幻爱/Beyond the Dream ◎片 名 幻爱 ◎年 代 2019 ◎产 地 中国香港 ◎类 别 爱情/悬疑 ◎语 言 粤语 ◎字 幕 中文 ◎上映日期 2019-11-14(香港亚洲电影节) / 2020-07-02(中国香港) ◎IMDb评分7.6/10 from 267 users ◎豆瓣评分 7.4/10 from 863 users ◎文件
--------------------------------------------------------------->
..................

要得到表格还可以参考pandas的df = pd.read_html(text)[0]