文章目录
前言
为了进行数据分析,需要搜集大量相关信息,运用 Python 把数据爬取并存进 CSV 文档(可用 Excel 打开),再运用 Stata 进行 match
一、python爬数据
二、使用步骤
1.UA伪装
代码如下:
headers = { 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36' }
2.读入网页数据,防止乱码
代码如下:
url = "https://www.ricedata.cn/variety/identified/uhhl_%s.htm" % page
res = requests.get(url)
res.encoding = res.apparent_encoding
text = res.text
3.网页解析,创建csv文件
代码如下:
main_page = BeautifulSoup(text, "html.parser")
f = open("ricedatashanghai.csv", mode="a")
4.寻找目标内容
代码如下:
table = main_page.find("table", attrs={"cellpadding":"2"})
trs = table.find_all("tr")
for tr in trs:
    lst = tr.find_all("td")
    if len(lst) != 0:
        for td in lst:
            #print(td.text)
            f.write(td.text.strip())
            f.write(",")
        f.write("\n")
5.定义一个函数实现多页循环,要找到网页规律
代码如下:
def down(page):
    url = "https://www.ricedata.cn/variety/identified/uhhl_%s.htm" % page
    res = requests.get(url)
    res.encoding = res.apparent_encoding
    text = res.text
    main_page = BeautifulSoup(text, "html.parser")
    f = open("ricedatashanghai.csv", mode="a")
    table = main_page.find("table", attrs={"cellpadding":"2"})
    trs = table.find_all("tr")
    for tr in trs:
        lst = tr.find_all("td")
        if len(lst) != 0:
            for td in lst:
                f.write(td.text.strip())
                f.write(",")
            f.write("\n")
for page in range(1,6): down(page)
import sys
from typing import io
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent so the target site serves the scraper as if it
# were a normal Chrome visit (UA spoofing).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/98.0.4758.109 Safari/537.36"
    ),
}
def down(page):
    """Fetch page `page` of the Jiangsu rice-variety listing and append its
    table rows (comma-separated, one line per <tr>) to ricedatajiangsu.csv.

    page: 1-based page index substituted into the jdsu_%s.htm URL.
    Returns None; the CSV file is the side effect.
    """
    url = "https://www.ricedata.cn/variety/identified/jdsu_%s.htm" % page
    # Pass the module-level `headers` (previously defined but never used)
    # so the request carries the spoofed User-Agent.
    res = requests.get(url, headers=headers)
    # The site serves Chinese text; let requests sniff the real charset
    # (e.g. GB2312/GBK) instead of the default, which produces mojibake.
    res.encoding = res.apparent_encoding
    main_page = BeautifulSoup(res.text, "html.parser")
    table = main_page.find("table", attrs={"cellpadding": "2"})
    if table is None:
        # Layout changed or page index out of range — nothing to write
        # (previously this crashed with AttributeError on `.find_all`).
        return
    # `with` guarantees the handle is closed (it was leaked before), and an
    # explicit utf-8 keeps Chinese cells intact on any platform default.
    with open("ricedatajiangsu.csv", mode="a", encoding="utf-8") as f:
        for tr in table.find_all("tr"):
            cells = tr.find_all("td")
            if cells:
                for td in cells:
                    f.write(td.text.strip())
                    f.write(",")
                f.write("\n")
# Crawl listing pages 1..18 of the Jiangsu table. The __main__ guard keeps
# the crawl from firing as an import side effect; behavior when executed as
# a script is unchanged.
if __name__ == "__main__":
    for page in range(1, 19):
        down(page)
# Shanghai variant of the scraper: pages uhhl_1..uhhl_5 -> ricedatashanghai.csv.
# Re-uses the requests/BeautifulSoup imports from the top of the file and
# redefines `headers` and `down` for this run. The unused `import sys` and the
# broken `from typing import io` (removed in Python 3.13) were dropped.

# UA spoofing: present a normal Chrome browser to the site.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36'
}

def down(page):
    """Fetch page `page` of the Shanghai rice-variety listing and append its
    table rows (comma-separated, one line per <tr>) to ricedatashanghai.csv.
    """
    url = "https://www.ricedata.cn/variety/identified/uhhl_%s.htm" % page
    # Pass `headers` (previously defined but never used by the request).
    res = requests.get(url, headers=headers)
    # Sniff the real charset to avoid mojibake on Chinese pages.
    res.encoding = res.apparent_encoding
    main_page = BeautifulSoup(res.text, "html.parser")
    table = main_page.find("table", attrs={"cellpadding": "2"})
    if table is None:
        # Layout changed or page out of range — nothing to write.
        return
    # `with` closes the handle (it was leaked before); explicit utf-8 keeps
    # Chinese cells intact regardless of the platform's default encoding.
    with open("ricedatashanghai.csv", mode="a", encoding="utf-8") as f:
        for tr in table.find_all("tr"):
            cells = tr.find_all("td")
            if cells:
                for td in cells:
                    f.write(td.text.strip())
                    f.write(",")
                f.write("\n")

for page in range(1, 6):
    down(page)