因为得买房子,所以爬一下。vip可见是因为不想沾染因果,稳一下。
(文章大概流程是找框架->改成爬详情->加代理ip->加多线程->done)
首先,网上找一个示例。随便什么都可以,重要的是得到一个框架。
然后运行成功了,类似这样
import requests, json, time
from bs4 import BeautifulSoup
import re, csv
import requests
import parsel
import time
import csv
def parse_one_page(url):
    """Scrape one Lianjia second-hand-listing page and append its rows to a CSV.

    Fetches *url*, extracts every listing card under ``.sellListContent li``,
    and appends one row per listing to '西安二手房信息.csv'.

    Fixes over the original:
    - the output file is opened with ``with`` so it is always closed
      (the original leaked the file handle on every call);
    - the header row is written only when the file is new/empty
      (the original wrote a header row once per page scraped);
    - a fresh dict is built per listing instead of mutating one shared dict.

    :param url: listing-page URL, e.g. https://xa.lianjia.com/ershoufang/yanta/pg1/
    :return: None (rows are written as a side effect)
    """
    import os  # local import so the module's top-of-file imports are untouched

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    selector = parsel.Selector(response.text)
    lis = selector.css('.sellListContent li')

    out_path = '西安二手房信息.csv'
    # Write the header only once: when the file does not exist yet or is empty.
    need_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0

    # utf-8-sig so Excel recognizes the encoding; newline='' avoids blank rows.
    with open(out_path, mode='a', encoding='utf-8-sig', newline='') as f:
        csv_writer = csv.DictWriter(
            f, fieldnames=['标题', '开发商', '房子信息', '发布周期', '售价/万', '单价'])
        if need_header:
            csv_writer.writeheader()
        for li in lis:
            row = {
                '标题': li.css('.title a::text').get(),
                # developer/community info comes as several <a> texts; join with '-'
                '开发商': '-'.join(li.css('.positionInfo a::text').getall()),
                '房子信息': li.css('.houseInfo::text').get(),
                '发布周期': li.css('.followInfo::text').get(),
                '售价/万': li.css('.totalPrice span::text').get(),
                '单价': li.css('.unitPrice span::text').get(),
            }
            csv_writer.writerow(row)
def write_to_file(content):
    """Append a single CSV row to data.csv.

    :param content: sequence of cell values forming one row.

    newline='' keeps the csv module from emitting blank lines on Windows.
    """
    with open('data.csv', 'a', newline='') as out_file:
        csv.writer(out_file).writerow(content)
def main(offset):
    """Crawl listing pages 1..offset-1 for each Xi'an district.

    For every district slug, builds the Lianjia page URL and hands it to
    parse_one_page; prints a lightweight progress marker every 10 pages.

    :param offset: exclusive upper bound on the page number per district.
    """
    regions = ['yanta', 'beilin', 'weiyang', 'baqiao', 'xinchengqu', 'lintong', 'yanliang', 'changan', 'lianhu']
    for region in regions:
        for page in range(1, offset):
            page_url = 'https://xa.lianjia.com/ershoufang/' + region + '/pg' + str(page) + '/'
            parse_one_page(page_url)
            # progress marker so long crawls show signs of life
            if page % 10 == 0:
                print(region, page)
# Guard the crawl behind __main__ so importing this module for its functions
# does not immediately fire off thousands of HTTP requests.
if __name__ == '__main__':
    main(101)
但是,问题来了,这个只是列表页,想要详情页,得自己写。于是,成了这个样子
import requests, json, time
from bs4 import BeautifulSoup
import re, csv
import requests
import parsel
import time
import csv
def main(offset):
hds = [{'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
{'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
{'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
{'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
{'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
{'User-Agent