5i5j网页数据获取及存储

from urllib import request
from lxml import etree
from pymysql_conn import Mysql_connect


# URL pattern shared by the listing pages; only the page number varies, e.g.
#   https://bj.5i5j.com/zufang/huilongguan/n2/
#   https://bj.5i5j.com/zufang/huilongguan/n3/
base_url = 'https://bj.5i5j.com/zufang/huilongguan/n{}'

# Headers sent with every page request: a desktop-browser User-Agent and a
# fixed cookie string. The cookie is written as one "name=value" pair per
# line and joined with "; " so the individual pairs are readable.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.84 Safari/537.36'
    ),
    'Cookie': '; '.join((
        '_Jo0OQK=791C6F15B09A2D259A9391EF5EEC575BFD1F7A88AAC94A312F1AD3242E4DA293BCD1C7681747AC76497EFE5D65969E662543E39F8380D5847F563B66A58E82A9D79DE8682CA7D10E3B498FB9E3C853EFEE298FB9E3C853EFEE215D8BEE34E43E5C0GJ1Z1Xw==',
        'PHPSESSID=g6oa0bq1hbcemnudl12l3sb4gf',
        'domain=bj',
        '_ga=GA1.2.378878987.1534505423',
        '_gid=GA1.2.981370027.1534505423',
        '_gat=1',
        'yfx_c_g_u_id_10000001=_ck18081719302215847544867586432',
        'yfx_f_l_v_t_10000001=f_t_1534505422575__r_t_1534505422575__v_t_1534505422575__r_c_0',
        'Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534505424',
        'Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534505424',
    )),
}
# Parameterized INSERT statement. The values are handed to the driver
# separately, which quotes and escapes them for SQL. Building the statement
# with repr() applied Python escaping instead: a non-breaking space became the
# text "\xa0" and a missing value became the bare token None.
sql = 'insert into data_5i5j (title,region,square,zone0,price) VALUES (%s,%s,%s,%s,%s)'
# Database helper that owns the open connection (.db) and cursor (.cursor).
mysql_object = Mysql_connect()

for i in range(1, 4):
    # Fetch the first three listing pages.
    url = base_url.format(i)
    req = request.Request(url, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(req) as response:
        html_str = response.read().decode('utf-8')
    # Parse the page and select one <li> per listing.
    html_ele = etree.HTML(html_str)
    li_list = html_ele.xpath('//ul[@class="pList"]/li')
    for li_ele in li_list:
        title = li_ele.xpath('./div[2]/h3/a')[0].text
        # The first <p> carries several fields separated by "  ·  ";
        # evaluate the XPath once and reuse the split parts.
        info_parts = li_ele.xpath('./div[2]/div[1]/p[1]/text()')[0].split('  ·  ')
        region = info_parts[0].replace('  ', '')
        # Drop the last two characters of the area field
        # (presumably the unit suffix -- confirm against the live page).
        square = info_parts[1].replace('  ', '')[0:-2]
        zone = li_ele.xpath('./div[2]/div[1]/p[2]/a')[0].text
        price = li_ele.xpath('./div[2]/div[1]/div/p[1]/strong')[0].text
        # Store the row: bound parameters, then commit once per listing.
        mysql_object.cursor.execute(sql, (title, region, square, zone, price))
        mysql_object.db.commit()
import pymysql

class Mysql_connect(object):
    """Small wrapper around one pymysql connection and one cursor.

    The connection is opened on construction and released by ``close()``,
    which also runs from ``__del__`` as a fallback.
    """

    def __init__(self, host='127.0.0.1', user='root', password='123456',
                 port=3306, database='db815', charset='utf8'):
        """Open the connection and create a cursor.

        All arguments are passed to ``pymysql.connect``; the defaults are the
        settings this class previously hard-coded, so ``Mysql_connect()``
        connects exactly as before.

        Raises:
            Whatever ``pymysql.connect`` raises when the connection fails.
        """
        # Assign both handles up front so close()/__del__ stay safe when
        # connect() raises part-way through construction.
        self.db = None
        self.cursor = None
        self.db = pymysql.connect(host=host, user=user, password=password,
                                  port=port, database=database,
                                  charset=charset)
        self.cursor = self.db.cursor()

    def execute_sql(self, sql, params=None):
        """Execute one statement and commit it.

        Args:
            sql: The SQL text. Use ``%s`` placeholders when ``params`` is
                given.
            params: Optional sequence or mapping of values for the driver to
                bind. With the default ``None`` the statement is sent as
                written.

        Raises:
            Any exception from the driver; the transaction is rolled back
            before the exception propagates.
        """
        try:
            self.cursor.execute(sql, params)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        # getattr covers instances whose __init__ never ran to completion.
        cursor = getattr(self, 'cursor', None)
        db = getattr(self, 'db', None)
        # Clear the handles first so a second call is a no-op.
        self.cursor = None
        self.db = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if db is not None:
                db.close()

    def __del__(self):
        """Release the connection when the object is garbage-collected."""
        self.close()


if __name__ == '__main__':
    # Manual smoke test when this module is run directly: connect with the
    # default settings and insert a single row (id 3) into table `xueqiu`.
    smoke_sql = 'insert into xueqiu (id)value(3)'
    connection = Mysql_connect()
    connection.execute_sql(smoke_sql)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值