#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import re
import requests
import datetime
from bs4 import BeautifulSoup
import os
import collections
import json
import sys
import time
# Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
# module is reloaded to get it back and make implicit str/unicode conversions
# use UTF-8. Python 3 has no builtin reload() and already defaults to UTF-8,
# so the hack is skipped there instead of raising NameError at import time.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

# Today's date formatted as YYYYMMDD (e.g. 20200420); per the original note
# it is used later when naming output files.
today = datetime.date.today().strftime('%Y%m%d')
def crawl_wiki_data(city):
"""
爬取百度百科中地区基础信息
"""
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url="""https://baike.baidu.com/item/{city}""".format(city=city)
polysemyURL ="""https://baike.baidu.com{href}"""
try:
time.sleep(1)
response = requests.get(url,headers=headers,timeout=(3,7))
count=0
while response.status_code != 200:
python爬虫爬取百度百科数据
最新推荐文章于 2024-07-27 12:20:46 发布