Talk is cheap. Show me the code.
OK……安排。
代码托管在 Gitee(码云)上:
https://gitee.com/leviathan-litan/Discovery_Data_Web
代码如下:
# coding:utf-8
# Description
"""
Author: Adamhuan
Blog: http://www.d-prototype.com
Goal: scrape web page data.
"""
# Imports
# Data analysis
import pandas as pd
# System
import os
# Date and time
import datetime,time
# Regular expressions
import re
# JSON data handling
import json
# HTTP or HTTPS
from urllib import request,response
import requests
# Web page markup parser
from bs4 import BeautifulSoup
# Variables
# Target URL (module-level; initialised to an empty string)
url_address = ""
# Class
class Dig_Data_Web:
# Class Attribute
# -- Object
obj_request = None
obj_response = None
obj_html = None
# -- Variable
headers = ""
string_url = ""
# -- Path
path_script_base = os.getcwd()
path_download_base = os.path.dirname(path_script_base) + "/download"
def __init__(self, str_url=""):
# Display / Intro
print("************************")
print("脚本:Web数据 - 爬取")
print("------------------")
print("当前路径:【" + self.path_script_base + "】")
print("下载路径:【" + self.path_download_base + "】")
print("************************")
# 目标URL
self.string_url = str_url
# 自定义【header】
# 防止【反爬虫】
self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
# 初始化【爬取网页】所需要的对象
if str_url != "" and str_url != None:
# 响应返回的数据
self.obj_request = request.Request(url=self.string_url, headers=self.headers)
self.obj_response = request.urlopen(self.obj_request)
s