爬取的网站
想买学区房,先要确定孩子想上哪所学校。如果不知道哪所学校好,想先了解所有学校的信息,那么我这个爬虫脚本可以帮到你。
爬取的代码
# coding:utf-8
import datetime
import json
import os
import re
import time
from copy import copy
import xlrd as xlrd
import xlwt as xlwt
from lxml import etree
import requests
import sys
from xlutils.copy import copy
class sxueSpider():
# Class-level request configuration for the 51sxue school-list crawler.

# Listing endpoint that the query parameters below are sent to.
url = 'http://xuexiao.51sxue.com/slist'
# Page number used to build ``params`` below.
page = 1
# NOTE(review): the original wrote ``4401,`` -- the trailing comma makes this a
# one-element tuple, not an int. The value is kept unchanged; only the
# parentheses are new. Presumably 4401 is the site's area code for Guangzhou
# (the accompanying article says it scrapes Guangzhou schools) -- confirm.
areaCodeS = (4401,)
# Query-string parameters. ``page`` and ``areaCodeS`` are copied in when this
# dict is built, so rebinding those names later does NOT update ``params``.
params = {
    "t": 3,  # meaning not visible here -- TODO confirm against the site
    "areaCodeS": areaCodeS,
    "page": page,
}
# Browser-like request headers. Every value must be ``str`` (or ``bytes``):
# Requests rejects other types with ``InvalidHeader``, which is why
# "Upgrade-Insecure-Requests" is the string "1" rather than the int 1.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    # NOTE(review): hard-coded cookie, presumably copied from a browser
    # session; it may have expired -- verify before relying on it.
    "Cookie": "sxue-visiter=656c8f51d735a9a30cd86b3a7589ec96; td_cookie=4030308922; __cfduid=d041d88e59886888e0d638c3fa0642e4b1585292336; UM_distinctid=1711ac96d21864-0cf7a7f25f27ea-6555782d-144000-1711ac96d2252d; sxue-visiter=318fc1b73d7b78e6e7e11ce7420aa6b9; sxueSID=1585292586; CNZZDATA1000463183=1169820516-1585289207-null%7C1585289207",
    "Host": "xuexiao.51sxue.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.8 Safari/537.36",
}
def get_page(self, url, headers, params, timeout=10):
    """Fetch ``url`` with a GET request and return the body decoded as GBK.

    Args:
        url: Address to request.
        headers: HTTP headers passed through to ``requests.get``.
        params: Query-string parameters passed through to ``requests.get``.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The page text with every U+00A9 (copyright sign) removed, or ``None``
        when the response status is not 200 or the request fails with a
        connection error or a timeout.
    """
    try:
        response = requests.get(
            url, headers=headers, params=params, timeout=timeout
        )
    except (requests.ConnectionError, requests.Timeout) as exc:
        # Best-effort fetch: report the failure and let the caller see None.
        print(exc)
        return None

    if response.status_code != 200:
        print(response.status_code)
        return None

    # Bytes that are not valid GBK are dropped ('ignore') rather than raising.
    # NOTE(review): the copyright sign is stripped presumably because it cannot
    # be re-encoded as GBK when printed or saved later -- confirm.
    return response.content.decode('gbk', 'ignore').replace(u'\xa9', u'')
def parse_page(self, html):
time.sleep(2)
all_result = []
for

该博客分享了如何使用爬虫从51搜学校网站抓取广州市的中小学信息,旨在帮助家长了解学校详情,同时也指出爬取的数据可用于房地产投资分析。
最低0.47元/天 解锁文章
2253

被折叠的 条评论
为什么被折叠?



