Web Scraping Practice Exercises


Preface

Using regular expressions for information extraction and text analysis, we complete data-mining exercises against Baidu News, Sogou News, and Sina Finance.
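Before the full examples, here is a tiny self-contained sketch of the regex technique used throughout this post (the sample HTML and pattern below are made up purely for illustration):

# Illustrative only: non-greedy "(.*?)" capture groups with re.S, as used in the spiders below
import re

sample = ('<h3 class="title"><a href="https://example.com/1">Headline A</a></h3>\n'
          '<h3 class="title"><a href="https://example.com/2">Headline B</a></h3>')

# re.S lets "." also match newlines; "(.*?)" captures the shortest possible text
pattern = re.compile('<h3 class="title"><a href="(.*?)">(.*?)</a>', re.S)
for href, title in pattern.findall(sample):
    print(title, '-', href)
# prints: Headline A - https://example.com/1
#         Headline B - https://example.com/2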

I. Fetching Baidu News for multiple companies and generating a data report

# Batch-fetch Baidu News for multiple companies
from urllib import request, parse
import random
import time
import re


class BaiduNewsSpider(object):
    def __init__(self):
        self.url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd={}'
        # counter for saved pages
        self.i = 0

    # Fetch the page
    def get_html(self,url):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode()
        # return the raw HTML to the caller
        return html

    # Parse (not used in this example; the regex extraction is shown in
    # the data-cleaning section below, so this is left as a stub)
    def parse_html(self,html):
        pass


    # Save the page to a local HTML file
    def write_html(self,filename,html):
        with open(filename,'w',encoding='utf-8') as f:
            f.write(html)
            self.i += 1


    # Main entry: loop over companies, fetch and save each result page
    def run(self):
        companys = ['华能信托','阿里巴巴','百度集团','万科集团','腾讯','京东']
        for company in companys:
            try:
                wd = parse.quote(company)   # URL-encode the Chinese keyword
                url = self.url.format(wd)
                html = self.get_html(url)
                filename = '{}百度新闻.html'.format(company)
                self.write_html(filename, html)
                # random 1-2 second sleep to avoid hammering the server
                time.sleep(random.uniform(1,2))
                print(company + '百度新闻爬取成功')
            except Exception:
                print(company + '百度新闻爬取失败')
        print('数量:', self.i)

if __name__=='__main__':
    start = time.time()
    spider = BaiduNewsSpider()
    spider.run()
    end = time.time()
    print('elapsed: %.2f s' % (end - start))
II. Batch-fetching multiple pages of Baidu News for multiple companies

1. Fetching the data in batches

The code is as follows (example):

# Batch-fetch multiple pages of Baidu News for multiple companies
from urllib import request,parse
import random
import time
import re


class BaiduNewsSpider(object):
    def __init__(self):
        self.url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd={}&pn={}'
        # rtt=4 sorts by time, e.g. https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E4%B8%87%E7%A7%91%E9%9B%86%E5%9B%A2&medium=0
        # counter for saved pages
        self.i = 0

    # Fetch the page
    def get_html(self,url):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode()
        # return the raw HTML to the caller
        return html

    # Parse (not used in this example; the regex extraction is shown in
    # the data-cleaning section below, so this is left as a stub)
    def parse_html(self,html):
        pass


    # Save the page to a local HTML file
    def write_html(self,filename,html):
        with open(filename,'w',encoding='utf-8') as f:
            f.write(html)
            self.i += 1


    # Main entry: loop over companies and over the first 3 result pages
    def run(self):
        companys = ['华能信托','阿里巴巴','百度集团','万科集团','腾讯','京东']
        for company in companys:
            wd = parse.quote(company)
            for i in range(1, 4):
                pn = (i - 1) * 10   # Baidu News paging parameter: 0, 10, 20 for pages 1-3
                url = self.url.format(wd, pn)
                html = self.get_html(url)
                filename = '{}百度新闻-第{}页.html'.format(company, i)
                self.write_html(filename, html)
                print('第%d页抓取成功' % i)

                # random 1-3 second sleep after each page
                time.sleep(random.randint(1, 3))
            print(company + '百度新闻爬取成功')
        print('数量:', self.i)

if __name__=='__main__':
    start = time.time()
    spider = BaiduNewsSpider()
    spider.run()
    end = time.time()
    print('elapsed: %.2f s' % (end - start))
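Both spiders above hard-code a single User-Agent string. If you want to vary it between requests, the fake_useragent package (assumed installed via pip install fake-useragent) can supply random browser UA strings; a small sketch:

# Optional sketch: rotate User-Agent strings instead of hard-coding one.
# Assumes `pip install fake-useragent`; falls back to the fixed string otherwise.
try:
    from fake_useragent import UserAgent
    _ua = UserAgent()
    def get_headers():
        return {'User-Agent': _ua.random}    # a different browser UA on every call
except ImportError:
    def get_headers():
        return {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/88.0.4324.150 Safari/537.36'}

# Usage inside get_html():  req = request.Request(url=url, headers=get_headers())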

2. Data cleaning

The code is as follows (example):

import requests
import re
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
url = 'https://www.sogou.com/sogou?query=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4&interation=1728053249&pid=sogou-wsse-9fc36fa768a74fa9&ie=utf8&w=&sut=6046&sst0=1612509795700&lkt=1%2C1612509795594%2C1612509795594'
# timeout: if the request takes longer than the given number of seconds, an exception is raised and the request is aborted
res = requests.get(url, headers=headers,timeout=10).text

# extract the data with a regular expression
re_bds = '<h3 class="vr-title">.*?<a id="(.*?)".*?href="(.*?)">(.*?)</a>.*?<div class="text-layout">.*?<span>.*?</span><span>(.*?)</span>'
pattern = re.compile(re_bds, re.S)
r_list = pattern.findall(res)

# data cleaning: strip any leftover HTML tags from each captured group
for r in r_list:
    item = {}
    item['id'] = re.sub('<.*?>', '', r[0].strip())
    item['网址'] = re.sub('<.*?>', '', r[1].strip())
    item['标题'] = re.sub('<.*?>', '', r[2].strip())
    item['发布时间'] = re.sub('<.*?>', '', r[3].strip())
    print(item)
    print('*' * 50)
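To make the cleaning step concrete, here is a tiny self-contained demonstration of the tag- and entity-stripping patterns used above and in the next section (the input string is made up):

# Illustration of the cleaning patterns (sample input is made up)
import re

raw = ' <em>阿里巴巴</em>发布最新财报&nbsp;'
clean = re.sub('<.*?>', '', raw.strip())    # remove HTML tags such as <em>...</em>
clean = re.sub('&.*?;', '', clean)          # remove HTML entities such as &nbsp;
print(clean)                                # -> 阿里巴巴发布最新财报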

3. Combining 1 and 2

The code is as follows (example):

# Batch-fetch Sogou news results for multiple companies and clean the data
from urllib import request,parse
import requests
import random
import time
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}

def Sougou(company):
    url = 'https://www.sogou.com/sogou?ie=utf8&p=40230447&interation=1728053249&interV=&pid=sogou-wsse-8f646834ef1adefa&query={}&shid=djt1'
    url = url.format(parse.quote(company))
    res = requests.get(url, headers=headers, timeout=10).text
    #print(res)
    
    # regular expressions to extract titles, links and publication dates
    p_title = '<h3 class="vr-title">.*?<a id=".*?".*?href=".*?">(.*?)</a>'
    title = re.compile(p_title, re.S).findall(res)
    p_href = '<h3 class="vr-title">.*?<a id=".*?".*?href="(.*?)">'
    href = re.compile(p_href, re.S).findall(res)
    p_date = '<h3 class="vr-title">.*?<span>.*?</span><span>(.*?)</span>'
    date = re.compile(p_date, re.S).findall(res)
    # print(title, href, date)

    # clean the data and print the results
    for i in range(len(title)):
        title[i] = re.sub('<.*?>','',title[i])
        title[i] = re.sub('&.*?;', '', title[i])
        date[i] = re.sub('<.*?>', '', date[i])
        print(str(i+1) + '.' + title[i] +'-' + date[i])
        print(href[i])

companys = ['华能信托','阿里巴巴','百度集团','万科集团','腾讯','京东']
for company in companys:
    Sougou(company)
    # random 1-3 second sleep after each company
    time.sleep(random.randint(1, 3))
    print(company + '搜狗新闻爬取成功')
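Sougou() only prints its results. A possible next step (the helper name and output file below are hypothetical, not part of the original script) is to collect the cleaned title/href/date lists into records and dump them, e.g. to JSON, so later analysis does not have to re-run the crawl:

# Hedged sketch: package the cleaned parallel lists into records and save them as JSON.
import json

def collect_records(company, title, href, date):
    return [{'company': company, 'title': t, 'href': h, 'date': d}
            for t, h, d in zip(title, href, date)]

# Example usage with made-up data (in practice, call this at the end of Sougou()
# with the cleaned title/href/date lists and accumulate across companies):
records = collect_records('阿里巴巴',
                          ['示例标题'], ['https://example.com/news/1'], ['2021-02-05'])
with open('sogou_news.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)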

III. Batch-fetching Sina Finance news for multiple companies

# Batch-fetch Sina Finance news for multiple companies and clean the data
# -- the Chinese company name cannot go into the URL directly; it is URL-encoded with parse.quote below
from urllib import request,parse
import requests
import random
import time
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}

def Sina(company):
    url = 'https://search.sina.com.cn/?q={}&c=news&from=channel'
    url = url.format(parse.quote(company))
    res = requests.get(url, headers=headers, timeout=10).text
    # print(res)


    # regular expressions to extract titles, links and publication dates
    p_title = '<h2><a href=.*?target=".*?">(.*?)</a>'
    title = re.compile(p_title, re.S).findall(res)
    p_href = '<h2><a href="(.*?)"'
    href = re.compile(p_href, re.S).findall(res)
    p_date = '<h2><a href=.*?<span class="fgray_time">(.*?)</span></h2>'
    date = re.compile(p_date, re.S).findall(res)
    #print(title,href,date)

    # clean the data and print the results
    for i in range(len(title)):
        title[i] = re.sub('<.*?>','',title[i])
        title[i] = re.sub('&.*?;', '', title[i])
        date[i] = re.sub('<.*?>', '', date[i])
        print(str(i+1) + '.' + title[i] +'-' + date[i])
        print(href[i])

companys = ['华能信托','阿里巴巴','百度集团','万科集团','腾讯','京东']
for company in companys:
    Sina(company)
    # random 1-3 second sleep after each company
    time.sleep(random.randint(1, 3))
    print(company + '新浪财经新闻爬取成功')

Summary

The above covers the web-scraping practice exercises; next up is continuing with Python-based financial data analysis.