为了分析深圳市所有长租、短租公寓的信息,爬取了某租房公寓网站上深圳区域所有在租公寓信息,以下记录了爬取过程以及爬取过程中遇到的问题:
爬取代码:
import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import pymongo
from config import *
from multiprocessing import Pool
client = pymongo.MongoClient(MONGO_URL) # 申明连接对象
db = client[MONGO_DB] # 申明数据库
def get_one_page_html(url): # 获取网站每一页的html
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/85.0.4183.121 Safari/537.36"
}
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.text
else:
return None
except RequestE