Scraping Lianjia second-hand housing listings with Python requests and storing them in MySQL

The script below walks the Beijing second-hand listing pages (ershoufang), extracts each listing's fields with a regular expression, and writes one row per listing into a local MySQL database through pymysql.

import requests
import re
import time
import warnings
import pymysql

class LianjiaSpider:
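    """Crawls Lianjia Beijing second-hand housing listing pages and stores
    one row per listing in the MySQL table lianjia.ershoufang."""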
    
    def __init__(self):
        # listing pages are paginated as /ershoufang/pg1/, /pg2/, ...
        self.baseurl = 'https://bj.lianjia.com/ershoufang/pg'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
        # create the connection object (pymysql.connect takes keyword arguments)
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', charset='utf8')
        # create the cursor
        self.cursor = self.db.cursor()
    
    
    
    def getPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = 'utf-8'
        self.parsePage(res.text)
        time.sleep(0.2)  # short pause between requests
    
    
    def parsePage(self, html):
        # nine capture groups, in order: title, address, model (layout),
        # size, direction, floor, start_year, position, total price
        patterns = 'div class="title">.*?<a class="".*? data-is_focus="1" data-sl="">(.*?)</a>.*?<div class="houseInfo">.*?<a .*? data-el="region">(.*?)</a><span class="divide">/</span>(.*?)<span class="divide">/</span>(.*?)<span class="divide">/</span>(.*?)<span class="divide">.*?<div class="flood"><div class="positionInfo">(.*?)<span class="divide">/</span>(.*?)<span class="divide">/</span><a .*? target="_blank">(.*?)</a>.*?<div class="totalPrice"><span>(.*?)</span>'
        r = re.compile(patterns, re.S)  # re.S lets '.' match across newlines
        r_list = r.findall(html)
        self.writeToMysql(r_list)
        
    
    def writeToMysql(self, r_list):
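        # DDL statements: the database and table are created lazily before
        # the first insert; 'if not exists' makes repeated calls harmless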
        c_db = 'create database if not exists lianjia charset utf8'
        u_db = 'use lianjia'
        c_table = """create table if not exists ershoufang(id int primary key auto_increment,
                    title varchar(100),
                    address varchar(100),
                    model varchar(100),
                    size varchar(100),
                    direction varchar(100),
                    floor varchar(30),
                    start_year varchar(20),
                    position varchar(30),
                    price varchar(10));
                """
        # %s placeholders are filled by execute(), which escapes each value
        ins = 'insert into ershoufang(title,address,model,size,direction,floor,start_year,position,price) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        warnings.filterwarnings('ignore')  # silence "already exists" warnings
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_table)
        except pymysql.Error as e:
            print('Failed to create database/table:', e)

        for r_tuple in r_list:
            L = []
            for r_str in r_tuple:
                L.append(r_str.strip())
            print(L)
            # execute(ins, [list]) binds the values to the %s placeholders
            self.cursor.execute(ins, L)
            self.db.commit()
            print('Saved to database')
    
    def workOn(self):
        num = int(input('Enter the number of pages to scrape:\n'))
        for n in range(1, num + 1):
            url = self.baseurl + str(n) + '/'
            self.getPage(url)
            print(f'Page {n} scraped successfully')
        # close the cursor and connection once all pages are done
        self.cursor.close()
        self.db.close()
    
if __name__ =='__main__':
    spider = LianjiaSpider()
    spider.workOn()
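To verify what actually landed in MySQL, a minimal read-back sketch like the one below can be run after the spider finishes. It assumes the same hard-coded localhost credentials as LianjiaSpider.__init__; adjust them to your setup.

import pymysql

# connect straight to the database the spider created
db = pymysql.connect(host='localhost', user='root',
                     password='123456', db='lianjia', charset='utf8')
cursor = db.cursor()

# total number of stored listings
cursor.execute('select count(*) from ershoufang')
print('rows:', cursor.fetchone()[0])

# peek at the first few rows
cursor.execute('select title, address, price from ershoufang limit 5')
for title, address, price in cursor.fetchall():
    print(title, address, price)

cursor.close()
db.close()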

Sample run (the original result screenshots are omitted): each listing prints as a list of its nine fields, followed by 'Saved to database' for every inserted row and 'Page N scraped successfully' after each page.