所需的库:BeautifulSoup(bs4)、requests
代码部分:
from bs4 import BeautifulSoup
import requests
# Scrape the price table from the target page with BeautifulSoup and save the
# extracted rows to a comma-separated text file.
headers = {
    # Placeholder value: replace it with your own browser's User-Agent string.
    'user-agent': '自己的ua'}
url = 'http://zhongdapeng.com/shucaijiage/1072.html'

# The timeout keeps the script from hanging forever on a dead server, and the
# `with` block releases the connection even if something below raises.
with requests.get(url=url, headers=headers, timeout=10) as response:
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Set the encoding on the response itself before reading `.text`;
    # `.text` is already a decoded str and must not be decoded a second time.
    # Tip: to check which encoding a page uses, type `document.charset` in the
    # browser's developer console.
    response.encoding = 'utf-8'
    html = response.text

page = BeautifulSoup(html, "html.parser")
table = page.find("tbody")
if table is None:
    # Without this guard the next line would die with an opaque AttributeError.
    raise RuntimeError("no <tbody> element found; the page layout may have changed")

# Drop the first <tr> (presumably the header row -- confirm against the page).
trs = table.find_all("tr")[1:]

# Open the output file only after the page was fetched and parsed, so a failed
# request does not truncate an existing file; `with` guarantees it is closed.
with open("菜价表格爬取.csv", mode='w', encoding='utf-8') as f:
    for tr in trs:
        fields = tr.text.split("\n")
        # The positions used below depend on the exact markup of each row.
        # Skip rows that are too short instead of crashing with IndexError.
        if len(fields) < 13:
            continue
        # Each row appears to hold two (number, name, price) triples side by
        # side, at indices 2/4/6 and 8/10/12 of the newline-split text.
        xuhao1, name1, jg1 = fields[2], fields[4], fields[6]
        xuhao2, name2, jg2 = fields[8], fields[10], fields[12]
        print(xuhao1, name1, jg1)
        print(xuhao2, name2, jg2)
        f.write(f"{xuhao1}, {name1}, {jg1}\n")
        f.write(f"{xuhao2}, {name2}, {jg2}\n")
# print(xxi)
# print(response.text)
# name = re.findall('td width="200" height="38" align="left" valign="middle"><a target="_blank" href=".*?" class="opu_l1">(.*?)</a></td>', response.text)
# print(name)
# print(response.text)
# soup = BeautifulSoup(html, 'lxml')
# # print(soup.title)
# # print(soup.head)
# # print(soup.a)
# # print(soup.p)
# # print(type(soup.a)) #<class 'bs4.element.Tag'>
# print(soup.name)
# print(soup.table)
"html.parser"指的是解析方法,<table (可以加上属性)>用css选择器获取所需信息,放弃re的匹配方式
使用 XPath(lxml)实现的版本:
import re
from lxml import etree
import requests
# Scrape the same price table with lxml/XPath, print every non-empty row and
# save it to the CSV file.
headers = {
    # Placeholder value: replace it with your own browser's User-Agent string.
    'user-agent': '自己的'}
url = 'http://zhongdapeng.com/shucaijiage/1072.html'

# The timeout prevents an indefinite hang; the `with` block closes the
# connection, which the original script never did.
with requests.get(url=url, headers=headers, timeout=10) as response:
    # Fail early on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    # Set the encoding on the response itself before reading `.text`;
    # `.text` is already a decoded str and must not be decoded a second time.
    response.encoding = 'utf-8'
    html = response.text

et = etree.HTML(html)
li_list = et.xpath("//tbody/tr")

# The original opened this file in 'w' mode (truncating it) but never wrote to
# it or closed it. Write the extracted rows so the file is actually useful,
# and let `with` close the handle.
with open("菜价表格爬取.csv", mode='w', encoding='utf-8') as f:
    for tr in li_list:
        # Text of every <p> inside the top-aligned cells of this row.
        tds = tr.xpath("./td[@valign='top']/p/text()")
        if not tds:
            # Rows without matching cells carry no data.
            continue
        print(tds)
        f.write(", ".join(tds) + "\n")
本文仅供技术分享讨论,无其他用途