import requests
import re
import time
import warnings
import pymysql
class LianjiaSpider:

    def __init__(self):
        self.baseurl = 'https://bj.lianjia.com/ershoufang/pg'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
        # Create the connection object (pymysql 1.0+ requires keyword arguments)
        self.db = pymysql.connect(host='localhost', user='root',
                                  password='123456', charset='utf8')
        # Create the cursor
        self.cursor = self.db.cursor()
    def getPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = 'utf-8'
        self.parsePage(res.text)
        time.sleep(0.2)  # throttle requests a little between pages
    def parsePage(self, html):
        # One regex captures, per listing block: title, region, layout,
        # size, direction, floor, build year, position and total price.
        patterns = 'div class="title">.*?<a class="".*? data-is_focus="1" data-sl="">(.*?)</a>.*?<div class="houseInfo">.*?<a .*? data-el="region">(.*?)</a><span class="divide">/</span>(.*?)<span class="divide">/</span>(.*?)<span class="divide">/</span>(.*?)<span class="divide">.*?<div class="flood"><div class="positionInfo">(.*?)<span class="divide">/</span>(.*?)<span class="divide">/</span><a .*? target="_blank">(.*?)</a>.*?<div class="totalPrice"><span>(.*?)</span>'
        r = re.compile(patterns, re.S)
        r_list = r.findall(html)
        self.writeToText(r_list)
    def writeToText(self, r_list):
        # Despite the name, this method writes the parsed rows into MySQL.
        c_db = 'create database if not exists lianjia charset utf8'
        u_db = 'use lianjia'
        c_table = """create table if not exists ershoufang(id int primary key auto_increment,
                     title varchar(100),
                     address varchar(100),
                     model varchar(100),
                     size varchar(100),
                     direction varchar(100),
                     floor varchar(30),
                     start_year varchar(20),
                     position varchar(30),
                     price varchar(10));
                  """
        ins = 'insert into ershoufang(title,address,model,size,direction,floor,start_year,position,price)values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        # Suppress "database/table already exists" warnings from MySQL
        warnings.filterwarnings('ignore')
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_table)
        except pymysql.MySQLError:
            pass
        for r_tuple in r_list:
            L = []
            for r_str in r_tuple:
                L.append(r_str.strip())
            print(L)
            # execute(ins, [list of values]) binds one row's fields to the %s placeholders
            self.cursor.execute(ins, L)
            self.db.commit()
            print('Saved to database')
    def workOn(self):
        num = int(input('Enter the number of pages to scrape:\n'))
        for n in range(1, num + 1):
            # Lianjia paginates as /ershoufang/pg1/, /ershoufang/pg2/, ...
            url = self.baseurl + str(n) + '/'
            self.getPage(url)
            print('Page ' + str(n) + ' scraped successfully')
        self.cursor.close()
        self.db.close()
if __name__ == '__main__':
    spider = LianjiaSpider()
    spider.workOn()
Run result:
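Since writeToText commits each row as soon as it is parsed, a quick way to confirm the crawl landed in MySQL is to count the stored rows. A minimal sketch, assuming the same local MySQL credentials as the spider above:

import pymysql

# Connect with the same assumed credentials the spider uses.
db = pymysql.connect(host='localhost', user='root',
                     password='123456', database='lianjia', charset='utf8')
cursor = db.cursor()
cursor.execute('select count(*) from ershoufang')
print('rows stored:', cursor.fetchone()[0])
cursor.close()
db.close()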