一个爬虫示例
使用到的库
- requests 网络请求
- parsel 网页解析
- pandas 数据处理和存储
- pymysql、dbutils 数据持久化
其中MySQL存储用的是一个封装好的API类
说明
- 数据的存储会有两种模式,一种是直接保存为excel,一种是保存到MySQL数据库
- 请通过 `write_type` 来控制存储模式（可选值为 excel 或 mysql）
- 为了方便测试,最后会提供简单的DDL创建语句
代码示例
import datetime
import json
import re
import time

import pandas as pd
import requests
from fake_useragent import UserAgent
from parsel import Selector

from mysql_client_tools import MySQLClient
class HomeLinkSpider(object):
    """Scrape Lianjia second-hand listings (Pudong, 3 bedrooms, 5-8M CNY).

    Each listing page is parsed into a dict; depending on ``write_type``
    the rows are either collected in memory and dumped to an Excel file
    (``"excel"``) or inserted one-by-one into MySQL (``"mysql"``).
    """

    # Storage mode: "excel" buffers rows for write_to_file(),
    # "mysql" inserts each row immediately via write_to_mysql().
    write_type = "excel"
    # Upper bound on pages crawled; kept small so test runs stay short.
    page_limit = 3

    def __init__(self, db_config):
        """:param db_config: keyword arguments forwarded to MySQLClient."""
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}
        self.data = list()
        self.path_name = "浦东_三房_500_800万"
        self.url = "https://sh.lianjia.com/ershoufang/pudong/a3p5/"
        self.mc = MySQLClient(**db_config)

    def get_max_page(self):
        """Return the number of pages to crawl, or None on request failure.

        The site's total page count is read from the pager's ``page-data``
        attribute and capped at ``page_limit`` (the original code always
        returned 3, which over-crawls when fewer than 3 pages exist).
        """
        response = requests.get(self.url, headers=self.headers)
        if response.status_code != 200:
            print("请求失败 status:{}".format(response.status_code))
            return None
        selector = Selector(response.text)
        page_box = selector.css('div[class="page-box house-lst-page-box"]')
        # page-data is a JSON blob such as {"totalPage":86,"curPage":1}.
        # json.loads parses it safely; the original eval() would execute
        # arbitrary remote content.
        max_page = json.loads(page_box[0].xpath('//@page-data').get())["totalPage"]
        print("最大页码数:{}".format(max_page))
        return min(max_page, self.page_limit)

    def parse_page(self):
        """Crawl every listing page and dispatch each parsed row to storage."""
        max_page = self.get_max_page()
        if not max_page:
            # Request failed (get_max_page returned None); nothing to crawl.
            # The original code crashed here with TypeError on None + 1.
            return
        # Compile once, outside the per-listing loop.
        floor_pattern = re.compile(r'\d{1,2}')
        year_pattern = re.compile(r'\d{4}')
        price_pattern = re.compile(r'\d+')
        for page in range(1, max_page + 1):
            url = 'https://sh.lianjia.com/ershoufang/pudong/pg{}a3p5/'.format(page)
            response = requests.get(url, headers=self.headers)
            selector = Selector(response.text)
            ul = selector.css('ul.sellListContent')[0]
            for li in ul.css('li'):
                detail = dict()
                detail['title'] = li.css('div.title a::text').get()
                # houseInfo looks like "3室2厅 | 89平米 | 南 | 精装 | 中楼层(共18层) | 2008年建".
                # Pad the split so a malformed row cannot raise IndexError.
                info = (li.css('div.houseInfo::text').get() or "").split(" | ")
                info += [""] * (6 - len(info))
                detail['bedroom'] = info[0]
                detail['area'] = info[1]
                detail['direction'] = info[2]
                match = floor_pattern.search(info[4])
                detail['floor'] = match.group() if match else "未知"
                match = year_pattern.search(info[5])
                detail['year'] = match.group() if match else "未知"
                position = li.css('div.positionInfo a::text').getall()
                position += [""] * (2 - len(position))
                detail['house'] = position[0]
                detail['location'] = position[1]
                # Prices may be missing on ad/placeholder <li> rows; the
                # original .group() on a failed search raised AttributeError.
                match = price_pattern.search(li.css('div.totalPrice span::text').get() or "")
                detail['total_price'] = match.group() if match else "未知"
                match = price_pattern.search(li.css('div.unitPrice span::text').get() or "")
                detail['unit_price'] = match.group() if match else "未知"
                detail['pic_code'] = li.css('a.noresultRecommend ::attr(data-housecode)').get()
                detail['pic_url'] = li.css('a.noresultRecommend img.lj-lazy ::attr(data-original)').get()
                if self.write_type == 'excel':
                    self.data.append(detail)
                elif self.write_type == 'mysql':
                    self.write_to_mysql(detail)
                else:
                    print("No write type specified.")

    def write_to_file(self):
        """Dump the buffered rows to ``<path_name>_HHMMSS.xlsx``.

        No-op unless ``write_type`` is "excel". Failures are reported,
        not raised (best-effort persistence, matching original behavior).
        """
        if self.write_type != 'excel':
            return
        # Chinese headers, in the same order parse_page inserts the keys.
        columns = ["标题", "房厅", "面积", "朝向", "楼层", "年份", "小区", "位置",
                   "总价(万)", "单价(元/平方米)", "图像id", "图像地址"]
        try:
            print("start write to file...")
            df = pd.DataFrame(self.data)
            df.columns = columns
            t = datetime.datetime.now()
            file_name = self.path_name + '_%02d%02d%02d.xlsx' % (t.hour, t.minute, t.second)
            # Context manager replaces the writer.save() call removed in
            # pandas 2.0; the to_excel `encoding` kwarg (also removed) is
            # dropped — output is UTF-8 regardless.
            with pd.ExcelWriter(file_name, engine="xlsxwriter") as writer:
                df.to_excel(writer, sheet_name='Sheet1', header=True,
                            index=False, startcol=0, startrow=0)
            print("Write a file to path name %s Successful." % self.path_name)
        except Exception as e:
            print("Fail to write file to path name: %s, Case: %s" % (self.path_name, e))

    def write_to_mysql(self, detail):
        """Insert one listing row, skipping pic_codes already stored.

        The lookup is parameterized (%s placeholder) because pic_code is
        scraped, untrusted input.
        """
        table = "lian_jia_data"
        sql = '''select id from {} where pic_code=%s'''.format(table)
        result = self.mc.execute(sql, detail["pic_code"])
        if result:
            print("this data has exist! the pic code is ", detail["pic_code"])
            return
        self.mc.insert(table, detail)
        print("insert to mysql successful! data is ", detail)
if __name__ == '__main__':
    # Connection settings consumed by MySQLClient; only actually used
    # when write_type is switched to "mysql".
    db_config = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "password": "pwd",
        "database": "data",
    }
    started_at = time.time()
    spider = HomeLinkSpider(db_config=db_config)
    spider.write_type = "excel"
    spider.parse_page()
    spider.write_to_file()
    print("耗时:{}秒".format(time.time() - started_at))
数据库DDL
-- Listings table populated by HomeLinkSpider.write_to_mysql.
CREATE TABLE `lian_jia_data` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(100) DEFAULT NULL,
  `house` varchar(100) DEFAULT NULL,
  `bedroom` varchar(100) DEFAULT NULL,
  `area` varchar(100) DEFAULT NULL,
  `direction` varchar(100) DEFAULT NULL,
  `floor` varchar(100) DEFAULT NULL,
  `year` varchar(100) DEFAULT NULL,
  `location` varchar(100) DEFAULT NULL,
  `total_price` varchar(100) DEFAULT NULL,
  `unit_price` varchar(100) DEFAULT NULL,
  `pic_code` varchar(100) DEFAULT NULL,
  -- Lazy-load image URLs routinely exceed 100 chars; 255 avoids
  -- silent truncation of pic_url.
  `pic_url` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`),
  -- The spider deduplicates with `SELECT id ... WHERE pic_code=%s`
  -- before every insert; without this index each check is a full scan.
  KEY `idx_pic_code` (`pic_code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci