bs4获取表格数据并写入Excel以及xpath的使用

库:BeautifulSoup(bs4)、requests;xpath 部分另外使用 lxml

代码部分:



from bs4 import BeautifulSoup
import requests
import csv

# Scrape the vegetable price table from the page below with BeautifulSoup,
# print every (number, name, price) record and save the records to a CSV file.

headers = {
    # Placeholder value - replace it with a real browser User-Agent string
    # before running.
    'user-agent': '自己的ua'}

url = 'http://zhongdapeng.com/shucaijiage/1072.html'

# The timeout stops the script from hanging on an unresponsive server; the
# with-block releases the connection even if a later step raises.
with requests.get(url=url, headers=headers, timeout=10) as response:
    response.raise_for_status()
    # Set the encoding before reading .text so the body is decoded as UTF-8
    # exactly once. (Running `document.charset` in the browser console shows
    # which encoding the page declares.)
    response.encoding = 'utf-8'
    html = response.text

page = BeautifulSoup(html, "html.parser")
table = page.find("tbody")
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError(f"no <tbody> element found in {url}")

# The first row is skipped (presumably the table header - confirm on the page).
trs = table.find_all("tr")[1:]

# newline='' is what the csv module expects of the file object it writes to.
# NOTE(review): Excel may need encoding='utf-8-sig' to detect UTF-8 when the
# file is opened by double-click - confirm for the target setup.
with open("菜价表格爬取.csv", mode='w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for tr in trs:
        # Read the cells directly instead of splitting the row text on
        # newlines and picking fixed positions, which breaks as soon as the
        # whitespace in the markup changes.
        cells = [td.get_text(strip=True) for td in tr.find_all("td")]
        # A row carries several records side by side, three cells each
        # (number, name, price); a trailing incomplete group is ignored.
        for start in range(0, len(cells) - 2, 3):
            xuhao, name, jg = cells[start:start + 3]
            if not (xuhao or name or jg):
                continue  # padding cells at the end of the table
            print(xuhao, name, jg)
            writer.writerow([xuhao, name, jg])
    # print(xxi)









# print(response.text)
# name = re.findall('td width="200" height="38" align="left" valign="middle"><a  target="_blank" href=".*?" class="opu_l1">(.*?)</a></td>', response.text)
# print(name)



# print(response.text)

# soup = BeautifulSoup(html, 'lxml')
# # print(soup.title)
# # print(soup.head)
# # print(soup.a)
# # print(soup.p)
# # print(type(soup.a)) #<class 'bs4.element.Tag'>
# print(soup.name)
# print(soup.table)



"html.parser" 指定 BeautifulSoup 使用的解析器;通过 find / find_all 按标签名(例如 <table>,也可以附加属性条件)定位并获取所需信息,不再使用 re 正则匹配的方式。

xpath的部分:

 

import re
from lxml import etree


import requests
# Fetch the vegetable price page and print the cell texts of every table row,
# this time selecting the nodes with lxml XPath expressions.
# No output file is opened here: the script only prints, and opening the CSV
# in 'w' mode without writing to it would merely truncate it.

headers = {
    # Placeholder value - replace it with a real browser User-Agent string
    # before running.
    'user-agent': '自己的'}

url = 'http://zhongdapeng.com/shucaijiage/1072.html'

# The timeout stops the script from hanging on an unresponsive server; the
# with-block releases the connection even if a later step raises.
with requests.get(url=url, headers=headers, timeout=10) as response:
    response.raise_for_status()
    # Set the encoding before reading .text so the body is decoded as UTF-8
    # exactly once.
    response.encoding = 'utf-8'
    html = response.text

et = etree.HTML(html)
# Alternative: "//tbody/tr/td[@valign='top']/p/text()" returns the texts of
# all rows as one flat list instead of one list per row.
li_list = et.xpath("//tbody/tr")
for i in li_list:
    # Text of each <p> inside the row's top-aligned cells.
    tds = i.xpath("./td[@valign='top']/p/text()")
    if tds:  # rows without matching cells give an empty list; skip them
        print(tds)

 

本文仅供技术分享讨论,无其他用途

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值