import requests
import re
from lxml import etree
import pymysql
import xlwt
import pandas as pd
class DBSendRequest:
    """Thin wrapper around :mod:`requests` carrying fixed browser-like headers.

    Usage: assign a URL to the ``sendRequest`` property, then *read* the
    property to perform the GET request and obtain the ``requests.Response``.
    """

    def __init__(self):
        # Target URL; must be set via the ``sendRequest`` setter before fetching.
        self.url = ''
        # Static headers imitating desktop Chrome. The Cookie value was
        # captured from a live browser session — NOTE(review): it will expire;
        # confirm the site still accepts requests without a fresh cookie.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
            'Cookie': 'firstWord=%u7F51%u7EDC%u722C%u866B; JSESSIONID=D48FD91EAB2F110581C6F6E518E473D8; TS01c20281=01a6b27f08842583690338c8b44c4a12a0eb858e29ffccddb97eca441b6e0356ac926538c2ddcd9467e5b84d21e19b96c9e7d669b1; userSearch=siteCode-N000005434&column-%E5%85%A8%E9%83%A8&uc-0&firstWord-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&searchWord-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB; TS0184b316=01a6b27f08842583690338c8b44c4a12a0eb858e29ffccddb97eca441b6e0356ac926538c2ddcd9467e5b84d21e19b96c9e7d669b1; Hm_lvt_d7c7037093938390bc160fc28becc542=1687968005,1688201468; Hm_lpvt_d7c7037093938390bc160fc28becc542=1688201468; TSd84ad2c7027=0886aacbbeab2000f74d9e06f8354b310125101d999e9f5610c902b924fabafe201e06f2e8753919085c92f583113000a1832ca937dec26900397a34d18810404499f43fe77a3f98372a2d00217e5b675744aa26abac9e24ad7b9ea0388b97f4'
        }

    @property  # reading this property performs the HTTP GET (network I/O)
    def sendRequest(self):
        """GET ``self.url`` with the stored headers and return the response."""
        # Fix: the original bound ``response.cookies`` to an unused local;
        # that dead assignment has been removed.
        return requests.get(self.url, headers=self.headers)

    @sendRequest.setter
    def sendRequest(self, url):
        """Store *url* as the target of the next request."""
        self.url = url
class DealData(DBSendRequest):
    """Scrape the spp.gov.cn search results for "网络爬虫" (web crawler),
    persist (category, title) pairs into MySQL table ``rmjcy`` and export
    them to an Excel workbook.

    NOTE(review): :meth:`detail` relies on a module-level pymysql connection
    named ``con`` existing at call time — confirm against the ``__main__``
    block before reusing this class elsewhere.
    """

    def indexDealdata(self):
        """Download the search-result page and return its decoded HTML text."""
        self.sendRequest = "https://www.spp.gov.cn/guestweb/s?siteCode=N000005434&searchWord=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB"
        response = self.sendRequest  # property read triggers the GET
        # Fix: the original also ran two xpath queries here and discarded
        # the results; that dead work has been removed.
        return response.content.decode()

    def detail(self):
        """Parse the result page, insert each row into MySQL and write all
        rows to ``人民检察院2.xls``."""
        tree = etree.HTML(self.indexDealdata())
        # Result-item category text and the linked article titles.
        stype = tree.xpath('//*[@id="showPage"]/div/div[1]/a/text()')
        titel = tree.xpath('//*[@id="showPage"]/div/div[1]/h3/a/@title')
        df = pd.DataFrame(columns=['类型', '数据'])
        # Fix: open ONE cursor and commit ONCE, instead of a new cursor and
        # a commit per row as the original did.
        with con.cursor() as cursor:
            for st, ti in zip(stype, titel):
                # Security fix: parameterized query — the original built the
                # INSERT with an f-string, injecting scraped text into SQL.
                cursor.execute(
                    "insert into rmjcy(leixin, title) values (%s, %s)",
                    (st, ti),
                )
                df.loc[len(df.index)] = [st, ti]
            con.commit()
            # Echo the full table contents for manual verification.
            cursor.execute("select * from rmjcy")
            print(cursor.fetchall())
        df.to_excel('人民检察院2.xls', sheet_name="人民检察院2.xls", na_rep="")
if __name__ == '__main__':
    # pymysql.connect() already establishes the connection; the original's
    # extra con.connect() call was redundant and has been removed.
    con = pymysql.connect(host='localhost', password='Www.1.com', port=3306,
                          user='root', database='lle_test', charset='utf8')
    try:
        dealdata = DealData()
        dealdata.detail()
    finally:
        # Fix: always release the DB connection, even if scraping fails.
        con.close()
# --- Scraped blog metadata (non-code residue, commented out so the file parses) ---
# Title: Python crawler: fetch website info and store it in MySQL and Excel
# First published: 2023-07-01 18:17:12
# Summary: This example uses requests to fetch web pages, lxml to parse the
# HTML, and pymysql to store the scraped data in a MySQL database.
# Views: 306