#! /usr/bin/env python
#-*- coding:utf-8 -*-

'''
Created on 2019年11月24日
@author: Admin
'''
9
import csv
import time

import requests
from lxml import etree
'''
方法名称: spider
功能: 爬取目标网站,并返回源码文本
参数: url 目标网址
'''
20
21
def spider(url):
    """Fetch the target page and return its HTML source as text.

    Args:
        url: the page URL to download.

    Returns:
        str: the response body, or None when the request fails
        (the failure is reported on stdout).
    """
    # Browser-like headers; the cookie was captured from a live session and
    # is required by the site to serve the listing pages.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'cookie': 'TY_SESSION_ID=150d5f1d-3be9-47b7-8728-f5b0673e307d; lianjia_uuid=22c2fd7c-bd33-4b52-b13c-0455483c8c53; _smt_uid=5dda86a6.5152533e; UM_distinctid=16e9d9dfc04451-098d8fb5ad92f6-e353165-1fa400-16e9d9dfc05a07; _ga=GA1.2.1829982433.1574602409; digv_extends=%7B%22utmTrackId%22%3A%2221583074%22%7D; _jzqa=1.3521694123893513000.1574602407.1574773117.1575120474.3; _jzqc=1; _jzqckmp=1; _gid=GA1.2.1091277813.1575120477; CNZZDATA1255849584=948253718-1574601020-https%253A%252F%252Fwww.baidu.com%252F%7C1575116340; CNZZDATA1254525948=3090229-1574602304-https%253A%252F%252Fwww.baidu.com%252F%7C1575120323; _qzjc=1; CNZZDATA1255604082=2128363916-1574597104-https%253A%252F%252Fwww.baidu.com%252F%7C1575119427; lianjia_ssid=923a34dd-a281-4f27-8dd2-5acf42342745; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1574602407,1574773116,1575120687; _jzqy=1.1574602407.1575120687.3.jzqsr=baidu|jzqct=%E9%87%8D%E5%BA%86%E6%88%BF%E7%BD%91.jzqsr=baidu; select_city=500000; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216e9d9dfd30306-0d0ad150c956ec-e353165-2073600-16e9d9dfd319bc%22%2C%22%24device_id%22%3A%2216e9d9dfd30306-0d0ad150c956ec-e353165-2073600-16e9d9dfd319bc%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24latest_referrer_host%22%3A%22%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_utm_source%22%3A%22baidu%22%2C%22%24latest_utm_medium%22%3A%22pinzhuan%22%2C%22%24latest_utm_campaign%22%3A%22sousuo%22%2C%22%24latest_utm_content%22%3A%22biaotimiaoshu%22%2C%22%24latest_utm_term%22%3A%22biaoti%22%7D%7D; CNZZDATA1255633284=795194134-1574597808-https%253A%252F%252Fwww.baidu.com%252F%7C1575120759; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1575121032; _qzja=1.2113280281.1574602406885.1574773116776.1575120645619.1575120907050.1575121031859.0.0.0.46.3; _qzjb=1.1575120645619.11.0.0.0; _qzjto=11.1.0; _jzqb=1.15.10.1575120474.1; srcid=eyJ0Ijoie1wiZGF0YVwiOlwiZTZiZTc4OGQ0MGZlZDJlZmZhNjRmMGYyNmJhNGM5NDBlZWZkMmM5N2Y0MzU2MmYyYzY2ZjQwMzZhNTk1MTI1M2ZiZjc2OGU3ODEzODBhZTYzMzNiOWZkZTExNTBkMmIxY2FhMzlmMzZmMTM5NGU0YmEwYmY5OTdlNDI5NmRiYTVjYzA5NmNkY2JkZmZkNWRmZmVhZWU1MDFjZjU0NTgyOTU0ZTkxZmVkZjhhYmI3ODc1YjJlNjA2Yzk3ZWRhNDJlYWUxZTBiZGJlMjBkNjQ2MWRkZDU3ZDRkOTE5ZTM0NDUwNDZjODNiZjE5ZGI3MzQ1MjU1YWFmNmRkZWJhZDJkZDNjMjk2MGFjNzIxNGY2YWY2Y2JkOWM5ZDcxYTU5N2FhMzMwMTJjNzNlNGEwNjhiOGI3MzEzMzIyYzM0NmVkM2ZcIixcImtleV9pZFwiOlwiMVwiLFwic2lnblwiOlwiNDNlZDc3OGJcIn0iLCJyIjoiaHR0cHM6Ly9jcS5saWFuamlhLmNvbS9jaGVuZ2ppYW8vIiwib3MiOiJ3ZWIiLCJ2IjoiMC4xIn0=',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'upgrade-insecure-requests': '1',
    }
    try:
        # timeout so a dead connection cannot hang the whole crawl;
        # catch only requests' own errors instead of a bare except that
        # would also swallow KeyboardInterrupt/SystemExit.
        response = requests.get(url=url, headers=header, timeout=30)
        return response.text
    except requests.RequestException:
        print('failed to spider the target site, please check if the url is correct or the connection is available!')
36
'''
方法名称: spider_detail
功能: 解析html源码,提取房屋参数
参数: url 目标网址
'''
42
43
def spider_detail(url):
    """Parse one listing page and append every house row to the CSV file.

    Args:
        url: the listing-page URL to crawl.

    Side effects:
        Calls save_csv() once per successfully parsed listing; parse
        failures for a single listing are printed and skipped.
    """
    response_text = spider(url)
    if response_text is None:
        # spider() failed (and already reported it) — nothing to parse.
        # Without this guard etree.HTML(None) would raise.
        return
    sel = etree.HTML(response_text)
    # Each listing is one <li>; the site shows at most 30 per page.
    row = '/html/body/div[5]/div[1]/ul/li[%d]/div'
    for house_num in range(1, 31):
        base = row % house_num
        try:
            # Title text is "name mode area" separated by spaces;
            # the area field carries a trailing 平米 unit.
            house_info = sel.xpath(base + '/div[1]/a/text()')[0].strip().split(' ')
            house_name = house_info[0]
            house_mode = house_info[1]
            house_area = house_info[2].strip('平米')

            house_prim_money = sel.xpath(base + '/div[4]/span[2]/span[1]/text()')[0].strip()
            # keep only the part before the first '.' of the sale date
            house_sale_time = sel.xpath(base + '/div[2]/div[2]/text()')[0].strip().split('.')[0]
            # unit price text is wrapped as 单价…元/平米
            house_price = sel.xpath(base + '/div[3]/div[3]/span/text()')[0].strip().strip("单价").strip("元/平米")
            house_totle = sel.xpath(base + '/div[2]/div[3]/span/text()')[0].strip()
            house_url = sel.xpath(base + '/div[1]/a/@href')[0].strip()

            house_data = [house_name, house_area, house_mode,
                          house_sale_time, house_prim_money, house_price,
                          house_totle, house_url]
            save_csv(house_data)
        except Exception as e:
            # A listing with missing fields is skipped, not fatal.
            print(e)
            print("参数错误")
73
'''
方法名称: save_csv
功能: 将数据按行储存到csv文件中
参数: house_data 获取到的房屋数据列表
'''
79
80
def save_csv(house_data, path='E:/chongqing/cq_chengjiao_jiangbei_year.csv'):
    """Append one row of house data to a CSV file.

    Args:
        house_data: list of field values for a single house.
        path: destination CSV file; the default keeps the original
            hard-coded location, so existing callers are unchanged.
    """
    try:
        # utf-8-sig adds a BOM so Excel detects the encoding;
        # newline='' is required by the csv module to avoid blank rows.
        with open(path, 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(house_data)
    except (OSError, csv.Error):
        print('write csv error!')
90
'''
方法名称: get_all_urls
功能: 生成所有的url并存放到迭代器中
参数: page_number 需要爬网页总数
返回值: url 返回一个url的迭代
'''
97
98
def get_all_urls(page_number):
    """Yield the listing-page URLs for the first *page_number* pages.

    Args:
        page_number: total number of pages to crawl; must be a positive int.

    Yields:
        str: one page URL per iteration (pages are numbered from 1).
    """
    # isinstance replaces the type(x) == type(1) anti-pattern; bool is
    # excluded explicitly because the original comparison rejected
    # True/False as well.  防止错误输入
    if isinstance(page_number, int) and not isinstance(page_number, bool) and page_number > 0:
        for page in range(1, page_number + 1):
            yield 'https://cq.lianjia.com/chengjiao/jiangbei/pg' + str(page) + 'l3a4a5/'
    else:
        print('page_number is incorrect!')
107
def _main():
    """Crawl 100 result pages and append every listing to the CSV file."""
    # csv首列写入 — write the header row first
    save_csv(['house_name', 'house_area', 'house_mode',
              'house_sale_time', 'house_prim_money', 'house_price',
              'house_totle', 'house_url'])
    for url in get_all_urls(100):
        try:
            time.sleep(5)  # throttle requests so the site doesn't block us
            spider_detail(url)
        except Exception as e:
            print(e)
            print('An error has been occurred when spidering house-price of chongqing!')


if __name__ == '__main__':
    # Guarded entry point: importing this module no longer starts a crawl.
    _main()