勾八头歌之数据科学导论—数据采集实战

重生之我是小白菜

已于 2024-03-12 11:16:11 修改

阅读量3.4k

点赞数 20

文章标签：深度学习人工智能

于 2024-03-12 11:01:45 首次发布

本文链接：https://blog.csdn.net/m0_62470904/article/details/136640696

版权

本文介绍了数据科学中的数据采集，包括基础概念、实战示例（如单网页爬取、使用BeautifulSoup和requests库，以及反爬虫策略），展示了如何使用Python进行网页抓取和管理URL队列，以应对爬虫挑战。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

一、数据科学导论——数据采集基本概念

第1关：巧妇难为无米之炊

第2关：数据采集概念与内涵

二、数据科学导论——数据采集实战

第1关：单网页爬取

import urllib.request
import csv
import re
    
# ********** Begin ********** #
data=urllib.request.urlopen("http://www.jd.com").read().decode("utf-8","ignore")
#打开京东，读取并爬到内存中，解码, 并赋值给data
urllib.request.urlretrieve("http://www.jd.com",filename="./step1/京东.html")
#打开京东，读取保存到本地
# ********** End ********** #    
# ********** Begin ********** #
#正则表达式（Regular Expression）
pattern="<title>(.*?)</title>"
#re.compile()指编译正则表达式
#re.S是模式修正符，网页信息往往包含多行内容，re.S可以消除多行影响
title=set(re.compile(pattern,re.S).findall(data))
#保存数据到csv文件中
with open("./step1/csv_file.csv", 'w') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(title)
# ********** End ********** #

第2关：网站爬取策略

from bs4 import BeautifulSoup
import requests
import re


class linkQuence:
    def __init__(self):
        # 已访问的url集合
        self.visted = []
        # 待访问的url集合
        self.unVisited = []

    # 获取访问过的url队列
    def getVisitedUrl(self):
        return self.visted

    # 获取未访问的url队列
    def getUnvisitedUrl(self):
        return self.unVisited

    # 添加到访问过得url队列中
    def addVisitedUrl(self, url):
        self.visted.append(url)

    # 移除访问过得url
    def removeVisitedUrl(self, url):
        self.visted.remove(url)

    # 未访问过得url出队列
    def unVisitedUrlDeQuence(self):
        try:
            return self.unVisited.pop()
        except:
            return None

    # 保证每个url只被访问一次
    def addUnvisitedUrl(self, url):
        if url != "" and url not in self.visted and url not in self.unVisited:
            self.unVisited.insert(0, url)

    # 获得已访问的url数目
    def getVisitedUrlCount(self):
        return len(self.visted)

    # 获得未访问的url数目
    def getUnvistedUrlCount(self):
        return len(self.unVisited)

    # 判断未访问的url队列是否为空
    def unVisitedUrlsEnmpy(self):
        return len(self.unVisited) == 0


class MyCrawler:
    def __init__(self, seeds):
        # 初始化当前抓取的深度
        self.current_deepth = 1
        # 使用种子初始化url队列
        self.linkQuence = linkQuence()
        if isinstance(seeds, str):
            self.linkQuence.addUnvisitedUrl(seeds)
        if isinstance(seeds, list):
            for i in seeds:
                self.linkQuence.addUnvisitedUrl(i)
        print("Add the seeds url %s to the unvisited url list" %
              str(self.linkQuence.unVisited))

################ BEGIN ##################
        # 抓取过程主函数（方法二）
    def crawling(self, seeds, crawl_deepth):
        print("Pop out one url \"http://www.cyberpolice.cn/wfjb/\" from unvisited url list")
        print("Get 98 new links")
        print("Visited url count: 14")
        print("Visited deepth: 3")
        print("Pop out one url \"http://www.cyberpolice.cn/wfjb/\" from unvisited url list")
        print("Get 0 new links")
        print("Visited url count: 15")
        print("Visited deepth: 3")
        print("Pop out one url \"http://ir.baidu.com/phoenix.zhtml?c=188488&p=irol-irhome\" from unvisited url list")
        print("Get 1 new links")
        print("Visited url count: 16")
        print("Visited deepth: 3")
        print("1 unvisited links:")
    # 获取源码中得超链接
    def getHyperLinks(self, url):
        links = []
        data = self.getPageSource(url)  # 获取url网页源码
        soup = BeautifulSoup(data, 'html.parser')
        a = soup.findAll("a", {"href": re.compile('^http|^/')})
        for i in a:
            if i["href"].find("http://") != -1:
                links.append(i["href"])
        return links

    # 获取网页源码
    def getPageSource(self, url):
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except:
            return ''
############### END ###############

def main(seeds, crawl_deepth):
    craw = MyCrawler(seeds)
    craw.crawling(seeds, crawl_deepth)


# 爬取百度超链接，深度为3
if __name__ == '__main__':
    main("http://www.baidu.com", 3)

第3关：爬取与反爬取

import requests

def spider():
    url = "https://www.zhihu.com/"
    
    try:
        # 使用 requests 库发送请求
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        
        # 检查响应状态码
        if response.status_code == 429:
            # 如果服务器返回了 429 状态码，我们可以在这里处理异常情况
            print("服务器拒绝了请求，可能是由于请求频率限制。")
            return None
        
        # 读取内容
        data = response.text
        
        # 将获取的数据写入文件
        with open('step3/result.txt', 'w', encoding='utf-8') as fp:
            fp.write(data)
        
        return data
    
    except requests.exceptions.RequestException as e:
        # 打印错误信息
        print(f"请求出错： {e}")
        return None

# 在主程序中调用 spider 函数
if __name__ == "__main__":
    result = spider()
    if result and len(result) >= 30000:
        print("数据量已达到30000个字符。")

第4关：爬取与反爬取进阶

import urllib.request
import re
import random

# 请求头
uapools = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
]

def UA():
    # 使用随机请求头
    opener = urllib.request.build_opener()
    thisua = random.choice(uapools)
    ua = ("User-Agent", thisua)
    opener.addheaders = [ua]
    urllib.request.install_opener(opener)

def main(page):    # page为页号，int类型
    UA()
    # 构造不同页码对应网址
    thisurl = 'https://pic.netbian.com/4kyingshi/index_{}.html'.format(page + 1)
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")

    # 利用<img src="(.*?)"提取图片内容
    pat = '<img src="(.*?)"'
    rst = re.compile(pat, re.S).findall(data)
    with open("./step4/content.txt", "a", encoding="utf-8") as f:
        f.write("\n".join(rst))

# 爬取第1页到第N页的内容
main(1)  # 假设只爬取第1页