# Crawl http://www.yododo.com/share/guide/?sort=isGood
# Resource download: https://download.csdn.net/download/tuboshushaobatu/11705937
# Fetches guide entries, their like counts, and the images on each detail page.
import scrapy
from day24.spiderproject.spiderproject.items import SpiderprojectItem
import time
from day24.homework import Downloader
import uuid
import os
class TourismspiderSpider(scrapy.Spider):
    """Crawl travel-guide listings from yododo.com.

    For every entry on a listing page, follows the link to its detail page,
    downloads the images found there into a per-guide directory, and yields a
    ``SpiderprojectItem`` with title, like count, spot hierarchy, image
    directory and raw detail HTML.  When the last entry of a listing page has
    been processed, the spider follows the pager's "next page" link.
    """
    name = 'tourismspider'
    # allowed_domains = ['']
    # Seed URL is supplied on the command line: scrapy crawl ... -a start_urls=<url>
    start_urls = []

    def __init__(self, start_urls=None, *args, **kwargs):
        super(TourismspiderSpider, self).__init__(*args, **kwargs)
        # BUG FIX: the original appended straight to the class-level list
        # (shared across instances) and appended None when no URL was passed.
        # Work on an instance-level copy and only add a real URL.
        self.start_urls = list(type(self).start_urls)
        if start_urls:
            self.start_urls.append(start_urls)

    def parse(self, response):
        """Parse one listing page and yield a detail-page Request per entry."""
        # Selector list: one <li> per guide entry on the listing page.
        touris = response.xpath("//div[@class='jdlist gl']/ul[@class='clearfix']/li")
        # The last anchor inside the pager is taken to be the "next page"
        # link -- TODO confirm against the live page markup.
        nextURL = response.xpath('//div[@class="pageNavi"]')
        currentResult = nextURL.xpath('a')[-1].xpath('@href').extract()[0].strip()
        for count, currentPalace in enumerate(touris, start=1):
            sItem = SpiderprojectItem()
            sItem['nextURL'] = currentResult
            position = currentPalace.xpath("a/strong")
            con = currentPalace.xpath("a/span")
            try:
                zan = con[1].xpath("span")[0].xpath("text()").extract()[0].strip()
                # Like count is rendered as e.g. "12个..."; keep the number part.
                sItem['zanCount'] = int(zan.split("个")[0])
                sItem['title'] = position.extract()[0].strip().split(">")[1].split("<")[0]
                detail_url = currentPalace.xpath("a/@href").extract()[0]
            except Exception as e:
                # BUG FIX: the original printed the possibly-unbound URL
                # variable here (NameError when extraction failed early) and
                # then still yielded a Request for it. Log and skip instead.
                self.logger.warning("failed to parse listing entry %d: %s", count, e)
                continue
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'item': sItem,
                      'tourisLen': len(touris),
                      'tourisCount': count,
                      'nextUrl': currentResult},
                dont_filter=True,
            )

    def parse_detail(self, response):
        """Parse a guide detail page: download its images, complete the item,
        and request the next listing page once this page's entries are done."""
        sItem = response.meta['item']
        tourisLen = response.meta['tourisLen']
        tourisCount = response.meta['tourisCount']
        nextUrl = response.meta['nextUrl']
        detailData = response.xpath("//div[@id='article-content']")
        # Breadcrumb anchors between the site root and the page itself,
        # e.g. continent > country > city.
        spots = response.xpath("//div[@class='crumbs w1200']/a")[2:-1:1]
        pp = detailData.xpath("p")
        spot = [i.xpath("text()").extract()[0].strip() for i in spots]
        # Build an os.sep-joined path fragment from the breadcrumb names.
        p = ''
        for crumb in spot:
            p = p + os.sep + crumb
        sItem['spotName'] = p.replace(os.sep, '>').lstrip('>')
        path_name = "d:\\spider\\travel" + p + os.sep + sItem['title']
        if not os.path.exists(path_name):
            os.makedirs(path_name)  # create the target directory if missing
        # NOTE(review): chdir is a process-wide side effect; concurrently
        # parsed pages may interleave download directories -- confirm the
        # Downloader writes absolute paths or runs are effectively serial.
        os.chdir(path_name)
        for o in pp:
            if o.xpath("img"):
                url = o.xpath("img/@src").extract()[0].strip()
                # Save each image under a unique name, keeping its extension.
                d = Downloader(url, str(uuid.uuid4()) + '.' + url.split(".")[-1])
                d.start()
                time.sleep(0.05)  # throttle image downloads slightly
        sItem['imgpath'] = path_name
        print(spot)
        if detailData:
            sItem['detailContent'] = detailData.extract()[0].strip()
        yield sItem
        # Once every entry of the current listing page has been visited,
        # follow the "next page" link (duplicate filtering stays enabled).
        if tourisLen == tourisCount and nextUrl:
            yield scrapy.Request(nextUrl, self.parse, dont_filter=False)
# Use matplotlib to draw a bar chart of the relationship between like counts and Chinese cities.
from day24.spiderproject.spiderproject.dao.basedao import BaseDao
from day24.spiderproject.spiderproject.dao.drawer import Drawer
import os
class countData(BaseDao):
    """DAO exposing aggregate queries over the crawled tourism tables.

    All query helpers are best-effort: on a database error they print the
    exception and return ``None`` (callers must handle a ``None`` result).
    """

    def __init__(self):
        super().__init__()

    def Zan(self, param):
        """Return (title, zanCount, detailPhontUrl) rows whose photo URL
        matches *param* (an SQL LIKE pattern), most-liked first."""
        return self.querySpot(
            "select title,zanCount,detailPhontUrl from tourism_mainpage t1,tourism_detail t2 "
            "where t1.mainId=t2.mainId and t2.detailPhonturl like %s order by zanCount desc",
            param)

    def bokecount(self, param):
        """Return the number of guide posts whose photo URL matches *param*."""
        return self.querySpot(
            "select count(*) from (select title,zanCount,detailPhontUrl from tourism_mainpage t1,"
            "tourism_detail t2 where t1.mainId=t2.mainId and t2.detailPhonturl like %s "
            "order by zanCount desc) t;",
            param)

    def Zancount(self, param):
        """Return the total like count over posts whose photo URL matches *param*."""
        # BUG FIX: the original SQL hard-coded "like ''" so *param* was
        # silently ignored; use the %s placeholder like the sibling queries.
        return self.querySpot(
            "select sum(zancount) from (select title,zanCount,detailPhontUrl from tourism_mainpage t1,"
            "tourism_detail t2 where t1.mainId=t2.mainId and t2.detailPhonturl like %s "
            "order by zanCount desc) t;",
            param)

    def comment(self, param):
        """Return the detail content of posts whose photo URL matches *param*."""
        return self.querySpot(
            "select detailContent from tourism_detail where detailPhonturl like %s",
            param)

    def create(self, sql, params):
        """Execute an INSERT/UPDATE, commit, and return (rowcount, last row id).

        Returns ``None`` when the statement fails (error is printed).
        """
        try:
            result = self.execute(sql, params)
            print(result)
            self.commit()
            lastRowId = self.getLastRowId()
            return result, lastRowId
        except Exception as e:
            print(e)

    def querySpot(self, sql, params):
        """Run a parameterized SELECT; return the rows, or ``None`` on error."""
        try:
            return self.query(sql, params)
        except Exception as e:
            print(e)
def getCurrentCity(filePath):
    """Return the names of the immediate subdirectories of *filePath*.

    BUG FIX: the original ignored its *filePath* argument and always walked
    the hard-coded 'D:\\spider\\travel\\...' path; it also rebuilt the result
    list inside the walk loop.  os.walk yields the top directory first, so
    that first tuple's ``dirs`` entry is exactly what we need.
    """
    for _root, dirs, _files in os.walk(filePath):
        return list(dirs)
    return []  # path does not exist or is not a directory
if __name__ == '__main__':
    c = countData()
    # BUG FIX: raw string -- the original 'D:\spider\travel\...' silently
    # turned the '\t' in '\travel' into a TAB character.
    filesList = getCurrentCity(r'D:\spider\travel\亚洲\中国')
    print(filesList)  # city directory names
    # One result per city: rows of (title, zanCount, detailPhontUrl).
    resultList = [c.Zan('%' + city + '%') for city in filesList]
    print(resultList)
    resultZanList = []
    for rows in resultList:
        # Zan() returns None when the query failed; count that city as zero.
        resultZanList.append(sum(row[1] for row in rows or []))
    print(resultZanList)  # total like count per city
    d1 = Drawer(filesList, resultZanList)
    d1.drawerIt()
    if resultZanList:
        # City with the highest total like count.
        mostPopCity = filesList[resultZanList.index(max(resultZanList))]
        print(mostPopCity)
import numpy as np
import matplotlib.pyplot as plt
class Drawer:
    """Render a labelled bar chart from parallel label (x) / value (y) lists."""

    def __init__(self, xList, yList):
        # xList: category labels (e.g. city names); yList: bar heights.
        self.xList = xList
        self.yList = yList

    def drawerIt(self):
        """Draw the bar chart and show it (blocks until the window closes)."""
        # Use a CJK-capable font so Chinese labels render correctly.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        labels = self.xList
        xs = self.xList
        ys = self.yList
        plt.bar(xs, ys, width=0.6)
        plt.grid(linestyle='--')  # dashed background grid
        ax = plt.subplot()
        # NOTE(review): these axis labels look copied from a job-salary
        # chart; confirm they are intended for the city/likes data.
        ax.set_ylabel('salary/month')
        ax.set_xlabel('position')
        ax.set_xticks(xs)           # tick positions
        ax.set_xticklabels(labels)  # tick display text
        for x, y in zip(xs, ys):
            # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in
            # 1.24 (AttributeError); the builtin float formats identically.
            plt.text(x, y + 0.05, '{0}'.format(float(y)), ha='center', va='bottom')
        plt.show()