python爬虫获取网站信息--存入mysql和excel中

该代码示例展示了一个使用Python进行网页抓取的程序:它结合了requests库获取网页内容,lxml库解析HTML,pymysql库与MySQL数据库交互将抓取的数据存入数据库,同时借助pandas将同一批数据导出为Excel文件。
import requests
import re
from lxml import etree
import pymysql
import xlwt
import pandas as pd

class DBSendRequest:
    """HTTP request helper.

    Usage pattern: assign a URL through the ``sendRequest`` property setter,
    then *read* the property to actually fire the GET request and obtain the
    ``requests.Response``.
    """

    def __init__(self):
        # Target URL; populated via the sendRequest setter before each request.
        self.url = ''
        # Browser-like User-Agent plus a captured session cookie so the site
        # serves the same content it would to an interactive browser session.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
            'Cookie': 'firstWord=%u7F51%u7EDC%u722C%u866B; JSESSIONID=D48FD91EAB2F110581C6F6E518E473D8; TS01c20281=01a6b27f08842583690338c8b44c4a12a0eb858e29ffccddb97eca441b6e0356ac926538c2ddcd9467e5b84d21e19b96c9e7d669b1; userSearch=siteCode-N000005434&column-%E5%85%A8%E9%83%A8&uc-0&firstWord-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&searchWord-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB; TS0184b316=01a6b27f08842583690338c8b44c4a12a0eb858e29ffccddb97eca441b6e0356ac926538c2ddcd9467e5b84d21e19b96c9e7d669b1; Hm_lvt_d7c7037093938390bc160fc28becc542=1687968005,1688201468; Hm_lpvt_d7c7037093938390bc160fc28becc542=1688201468; TSd84ad2c7027=0886aacbbeab2000f74d9e06f8354b310125101d999e9f5610c902b924fabafe201e06f2e8753919085c92f583113000a1832ca937dec26900397a34d18810404499f43fe77a3f98372a2d00217e5b675744aa26abac9e24ad7b9ea0388b97f4'
        }

    @property  # reading this property performs the GET and returns the Response
    def sendRequest(self):
        """Send a GET request to ``self.url`` and return the Response.

        A timeout is set so a stalled server cannot hang the scraper forever.
        (The original stored ``response.cookies`` in an unused local; removed.)
        """
        return requests.get(self.url, headers=self.headers, timeout=10)

    @sendRequest.setter
    def sendRequest(self, url):
        # Setting the property only records the URL; the request itself
        # happens when the property is next read.
        self.url = url


class DealData(DBSendRequest):
    """Fetch the procuratorate search-result page, extract (type, title)
    pairs, persist them to the MySQL table ``rmjcy`` and export the same
    rows to an Excel workbook.

    NOTE(review): ``detail`` depends on a module-level ``con`` (pymysql
    connection) created in the ``__main__`` block.
    """

    def indexDealdata(self):
        """Download the search-result page and return its decoded HTML text."""
        self.sendRequest = "https://www.spp.gov.cn/guestweb/s?siteCode=N000005434&searchWord=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB"
        response = self.sendRequest
        # The XPath extraction lives in detail(); the original also ran it
        # here and discarded the results, which was redundant work.
        return response.content.decode()

    def detail(self):
        """Parse the page, insert every (type, title) row into MySQL,
        print the table contents once, and dump the rows to an .xls file."""
        tree = etree.HTML(self.indexDealdata())
        stype = tree.xpath('//*[@id="showPage"]/div/div[1]/a/text()')
        titel = tree.xpath('//*[@id="showPage"]/div/div[1]/h3/a/@title')

        rows = list(zip(stype, titel))
        df = pd.DataFrame(rows, columns=['类型', '数据'])

        # Parameterized statement — the original interpolated scraped text
        # straight into the SQL string (injection-prone and breaks on quotes).
        sql = "insert into rmjcy(leixin, title) values (%s, %s)"
        with con.cursor() as cursor:
            # One executemany + one commit instead of per-row commit, and one
            # SELECT instead of re-reading the whole table on every iteration.
            cursor.executemany(sql, rows)
            con.commit()
            cursor.execute("select * from rmjcy")
            print(cursor.fetchall())

        df.to_excel('人民检察院2.xls', sheet_name="人民检察院2.xls", na_rep="")


if __name__ == '__main__':
    # pymysql.connect() already returns a live connection — the original's
    # extra con.connect() call forced a needless reconnect and is removed.
    con = pymysql.connect(host='localhost', password='Www.1.com', port=3306,
                          user='root', database='lle_test', charset='utf8')
    try:
        dealdata = DealData()
        dealdata.detail()
    finally:
        con.close()  # release the connection even if scraping/inserting fails

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值