I originally wanted to tackle the captcha head-on, but I could never get the POST to go through, so I settled for random delays to keep the anti-scraping check from triggering.
fangdd.py
```python
import requests
from lxml import etree
import re
from check_code import *       # captcha handling module
from get_pinyin import *       # Chinese-character-to-pinyin module
from save_to_mongo import *
import time
import random


class Fangdd():
    def __init__(self):
        user_agent_list = [
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
            "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)"
        ]
        # use a random User-Agent
        self.headers = {
            # pick one entry from the list above at random
            "User-Agent": random.choice(user_agent_list)
        }
        # self.c = Code_text()
        # self.rand_time = random.randint(1, 5)

    def get_html(self, url):
        # fetch the page source
        response = requests.get(url, headers=self.headers)
        html = response.text
        element = etree.HTML(html)
        # the page title tells us whether the anti-scraping check kicked in
        title = element.xpath('//title/text()')
        title = title[0]
        if title == 'chech captcha':
            print('Caught by the anti-scraping check, captcha page returned')
            # self.c.post_code()
            time.sleep(5)
        else:
            return html

    def get_location(self, html):
        # collect the city names listed on the home page
        addresses = []
        element = etree.HTML(html)
        lis = element.xpath('//div[@class="q3rm0"]/li[position()>1]//a/text()')
        for li in lis:
            # the names are Chinese characters, but the URLs use pinyin, so convert them
            li_pinyin = get_pinyin(li)
            addresses.append(li_pinyin)
        return addresses

    def get_all_url(self, addresses):
        # build the estate-listing URL for each city
        urls = []
        for address in addresses:
            addr_url = 'https://%s.fangdd.com/xiaoqu' % address
            urls.append(addr_url)
        return urls

    def parse_list_page(self, urls):
        not_found = []  # not every pinyin subdomain maps directly back to the characters, so some pages cannot be fetched uniformly
        for p_url in urls:
            # sleep a random interval to avoid the anti-scraping check
            # time.sleep(self.rand_time)
            html = self.get_html(p_url)
            element = etree.HTML(html)
            # polyphone conversion can produce a wrong URL; check the title to find out
            title = element.xpath('//title/text()')
            title = title[0]
            if title == '很抱歉!您访问的页面不存在!':  # the site's "page not found" title
                # some city names are abbreviated in the URL, so the URL built from
                # pinyin does not match the real one and the request 404s
                print('Pinyin does not match the real URL, dropping: %s' % p_url)
            else:
                print(title)
                # with fewer than 20 estates there is only one page, so the max page is 1
                max_xiaoqu = element.xpath('//p[@class="filter-result"]/b/text()')
                max_xiaoqu = max_xiaoqu[0]
                max_xiaoqu = int(max_xiaoqu)

                if max_xiaoqu <= 20 and max_xiaoqu != 0:
                    print('This city has only one page')
                    print('max_xiaoqu=%s' % max_xiaoqu)
                    max_page = 1
                    self.get_informations(p_url, max_page)

                elif max_xiaoqu > 20:
                    # find the largest page number to bound the loop
                    max_page = element.xpath('//div[@class="pagebox"]/a[position()<last()]/text()')
                    max_page = max_page[-1].strip()
                    max_page = int(max_page)
                    self.get_informations(p_url, max_page)

                # the max_xiaoqu == 0 case
                else:
                    print(p_url + ' has no estates')

    def get_informations(self, g_url, max_page):
        print('=' * 20 + 'crawling [%s]' % g_url + '=' * 30)

        for pageNo in range(1, max_page + 1):
            time.sleep(random.randint(1, 3))
            pageNo_url = g_url + '/?pageNo=%s' % pageNo
            print('=' * 10 + 'crawling [%s] ======= page [%s]' % (pageNo_url, pageNo) + '=' * 10)

            response = requests.get(pageNo_url, headers=self.headers)
            html = response.text
            element = etree.HTML(html)
            lis = element.xpath('//ul[@class="lp-list"]//li')

            for li in lis:
                information = {}
                li_url = li.xpath('.//h3[@class="name"]/a/@href')

                name = li.xpath('.//h3[@class="name"]/a/text()')
                name = name[0]

                address = li.xpath('.//div[@class="house-cont"]//p[@class="address"]/text()')
                address = address[0]
                address = re.sub(' ', '', address)

                price = li.xpath('.//div[@class="fr"]/span/em/text()')
                price = price[0].strip()
                price = re.sub('元/㎡', '', price)  # strip the yuan-per-square-meter unit

                information = {
                    'url': li_url,
                    'name': name,
                    'address': address,
                    'price': price
                }
                save_to_mongo(information)


def main():
    f = Fangdd()
    url = 'https://shanghai.fangdd.com/'
    html = f.get_html(url)
    address = f.get_location(html)
    urls = f.get_all_url(address)
    f.parse_list_page(urls)


if __name__ == '__main__':
    main()
```
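One thing `get_html` does not handle is its own return value: when the captcha page is detected it sleeps and returns `None`, and `parse_list_page` would then pass `None` to `etree.HTML` and crash. Below is a minimal retry wrapper sketch; the name `safe_get_html` and the `retries` parameter are mine for illustration, not part of the original code.

```python
import random
import time


def safe_get_html(spider, url, retries=3):
    # Fangdd.get_html returns None when the captcha page comes back,
    # so back off for a random interval and try again a few times
    for _ in range(retries):
        html = spider.get_html(url)
        if html is not None:
            return html
        time.sleep(random.randint(3, 8))
    return None
```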
get_pinyin.py
```python
from pypinyin import pinyin


def get_pinyin(text):
    # style=0 (NORMAL) drops the tone marks, see http://pypinyin.mozillazg.com/zh_CN/v0.9.1/
    p = pinyin(text, style=0)
    # e.g. [['chong'], ['qing']]
    a = []
    for i in p:
        a.append(i[0])
    # print(p)
    b = ''.join(a)
    return b
```
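The `[['chong'], ['qing']]` comment above corresponds to the city name 重庆; joining the first reading of each character gives the lowercase string used in the subdomain URLs:

```python
>>> from get_pinyin import get_pinyin
>>> get_pinyin('重庆')
'chongqing'
```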
save_to_mongo.py
```python
import pymongo

client = pymongo.MongoClient('127.0.0.1', port=27017)
db = client.fangdd_mark
collection = db.informations


def save_to_mongo(result):
    try:
        # insert_one replaces the insert() call that newer pymongo versions removed
        if collection.insert_one(result):
            pass
            # print('success save to mongodb')
    except Exception:
        print('error to mongo')
```
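Each document carries the same keys as the `information` dict built in `get_informations` (url, name, address, price). A quick sanity check from an interactive shell might look like this; the field values are made up purely for illustration:

```python
>>> from save_to_mongo import save_to_mongo, collection
>>> save_to_mongo({'url': ['/xiaoqu/123'], 'name': 'test', 'address': 'test address', 'price': '30000'})
>>> collection.find_one({'name': 'test'})
{'_id': ObjectId('...'), 'url': ['/xiaoqu/123'], 'name': 'test', 'address': 'test address', 'price': '30000'}
```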
Because of the delays, plus the fairly large amount of data, the crawl takes quite a while: I finished a match of Honor of Kings and the cities starting with "c" still weren't done, by which point the database already held 22,000 records.
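To check that progress figure while the crawler is still running, counting the documents in the collection is enough; a minimal sketch using the same connection settings as save_to_mongo.py:

```python
import pymongo

client = pymongo.MongoClient('127.0.0.1', port=27017)
# number of estate records saved so far
print(client.fangdd_mark.informations.count_documents({}))
```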
Run results: