# Crawl http://www.yododo.com/share/guide/?sort=isGood
# Resource download: https://download.csdn.net/download/tuboshushaobatu/11705937
# Fetches guide entries, their like counts, and the images on each detail page.
import scrapy
from day24.spiderproject.spiderproject.items import SpiderprojectItem
import time
from day24.homework import Downloader
import uuid
import os
class TourismspiderSpider(scrapy.Spider):
    """Crawl travel-guide listings from yododo.com.

    For every entry on a listing page, follows the link to its detail page,
    downloads the images found there into a per-guide directory, and yields a
    ``SpiderprojectItem`` with title, like count, spot hierarchy, image
    directory and raw detail HTML.  When the last entry of a listing page has
    been processed, the spider follows the pager's "next page" link.
    """
    name = 'tourismspider'
    # allowed_domains = ['']
    # Seed URL is supplied on the command line: scrapy crawl ... -a start_urls=<url>
    start_urls = []

    def __init__(self, start_urls=None, *args, **kwargs):
        super(TourismspiderSpider, self).__init__(*args, **kwargs)
        # BUG FIX: the original appended straight to the class-level list
        # (shared across instances) and appended None when no URL was passed.
        # Work on an instance-level copy and only add a real URL.
        self.start_urls = list(type(self).start_urls)
        if start_urls:
            self.start_urls.append(start_urls)

    def parse(self, response):
        """Parse one listing page and yield a detail-page Request per entry."""
        # Selector list: one <li> per guide entry on the listing page.
        touris = response.xpath("//div[@class='jdlist gl']/ul[@class='clearfix']/li")
        # The last anchor inside the pager is taken to be the "next page"
        # link -- TODO confirm against the live page markup.
        nextURL = response.xpath('//div[@class="pageNavi"]')
        currentResult = nextURL.xpath('a')[-1].xpath('@href').extract()[0].strip()
        for count, currentPalace in enumerate(touris, start=1):
            sItem = SpiderprojectItem()
            sItem['nextURL'] = currentResult
            position = currentPalace.xpath("a/strong")
            con = currentPalace.xpath("a/span")
            try:
                zan = con[1].xpath("span")[0].xpath("text()").extract()[0].strip()
                # Like count is rendered as e.g. "12个..."; keep the number part.
                sItem['zanCount'] = int(zan.split("个")[0])
                sItem['title'] = position.extract()[0].strip().split(">")[1].split("<")[0]
                detail_url = currentPalace.xpath("a/@href").extract()[0]
            except Exception as e:
                # BUG FIX: the original printed the possibly-unbound URL
                # variable here (NameError when extraction failed early) and
                # then still yielded a Request for it. Log and skip instead.
                self.logger.warning("failed to parse listing entry %d: %s", count, e)
                continue
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'item': sItem,
                      'tourisLen': len(touris),
                      'tourisCount': count,
                      'nextUrl': currentResult},
                dont_filter=True,
            )

    def parse_detail(self, response):
        """Parse a guide detail page: download its images, complete the item,
        and request the next listing page once this page's entries are done."""
        sItem = response.meta['item']
        tourisLen = response.meta['tourisLen']
        tourisCount = response.meta['tourisCount']
        nextUrl = response.meta['nextUrl']
        detailData = response.xpath("//div[@id='article-content']")
        # Breadcrumb anchors between the site root and the page itself,
        # e.g. continent > country > city.
        spots = response.xpath("//div[@class='crumbs w1200']/a")[2:-1:1]
        pp = detailData.xpath("p")
        spot = [i.xpath("text()").extract()[0].strip() for i in spots]
        # Build an os.sep-joined path fragment from the breadcrumb names.
        p = ''
        for crumb in spot:
            p = p + os.sep + crumb
        sItem['spotName'] = p.replace(os.sep, '>').lstrip('>')
        path_name = "d:\\spider\\travel" + p + os.sep + sItem['title']
        if not os.path.exists(path_name):
            os.makedirs(path_name)  # create the target directory if missing
        # NOTE(review): chdir is a process-wide side effect; concurrently
        # parsed pages may interleave download directories -- confirm the
        # Downloader writes absolute paths or runs are effectively serial.
        os.chdir(path_name)
        for o in pp:
            if o.xpath("img"):
                url = o.xpath("img/@src").extract()[0].strip()
                # Save each image under a unique name, keeping its extension.
                d = Downloader(url, str(uuid.uuid4()) + '.' + url.split(".")[-1])
                d.start()
                time.sleep(0.05)  # throttle image downloads slightly
        sItem['imgpath'] = path_name
        print(spot)
        if detailData:
            sItem['detailContent'] = detailData.extract()[0].strip()
        yield sItem
        # Once every entry of the current listing page has been visited,
        # follow the "next page" link (duplicate filtering stays enabled).
        if tourisLen == tourisCount and nextUrl:
            yield scrapy.Request(nextUrl, self.parse, dont_filter=False)
# Use matplotlib to draw a bar chart of the relationship between like counts and Chinese cities.
from day24.spiderproject.spiderproject.dao.basedao import BaseDao
from day24.spiderproject.spiderproject.dao.drawer import Drawer
import os
class countData(BaseDao):
    """DAO exposing aggregate queries over the crawled tourism tables.

    All query helpers are best-effort: on a database error they print the
    exception and return ``None`` (callers must handle a ``None`` result).
    """

    def __init__(self):
        super().__init__()

    def Zan(self, param):
        """Return (title, zanCount, detailPhontUrl) rows whose photo URL
        matches *param* (an SQL LIKE pattern), most-liked first."""
        return self.querySpot(
            "select title,zanCount,detailPhontUrl from tourism_mainpage t1,tourism_detail t2 "
            "where t1.mainId=t2.mainId and t2.detailPhonturl like %s order by zanCount desc",
            param)

    def bokecount(self, param):
        """Return the number of guide posts whose photo URL matches *param*."""
        return self.querySpot(
            "select count(*) from (select title,zanCount,detailPhontUrl from tourism_mainpage t1,"
            "tourism_detail t2 where t1.mainId=t2.mainId and t2.detailPhonturl like %s "
            "order by zanCount desc) t;",
            param)

    def Zancount(self, param):
        """Return the total like count over posts whose photo URL matches *param*."""
        # BUG FIX: the original SQL hard-coded "like ''" so *param* was
        # silently ignored; use the %s placeholder like the sibling queries.
        return self.querySpot(
            "select sum(zancount) from (select title,zanCount,detailPhontUrl from tourism_mainpage t1,"
            "tourism_detail t2 where t1.mainId=t2.mainId and t2.detailPhonturl like %s "
            "order by zanCount desc) t;",
            param)

    def comment(self, param):
        """Return the detail content of posts whose photo URL matches *param*."""
        return self.querySpot(
            "select detailContent from tourism_detail where detailPhonturl like %s",
            param)

    def create(self, sql, params):
        """Execute an INSERT/UPDATE, commit, and return (rowcount, last row id).

        Returns ``None`` when the statement fails (error is printed).
        """
        try:
            result = self.execute(sql, params)
            print(result)
            self.commit()
            lastRowId = self.getLastRowId()
            return result, lastRowId
        except Exception as e:
            print(e)

    def querySpot(self, sql, params):
        """Run a parameterized SELECT; return the rows, or ``None`` on error."""
        try:
            return self.query(sql, params)
        except Exception as e:
            print(e)
def getCurrentCity(filePath):
    """Return the names of the immediate subdirectories of *filePath*.

    BUG FIX: the original ignored its *filePath* argument and always walked
    the hard-coded 'D:\\spider\\travel\\...' path; it also rebuilt the result
    list inside the walk loop.  os.walk yields the top directory first, so
    that first tuple's ``dirs`` entry is exactly what we need.
    """
    for _root, dirs, _files in os.walk(filePath):
        return list(dirs)
    return []  # path does not exist or is not a directory
if __name__ == '__main__':
    c = countData()
    # BUG FIX: raw string -- the original 'D:\spider\travel\...' silently
    # turned the '\t' in '\travel' into a TAB character.
    filesList = getCurrentCity(r'D:\spider\travel\亚洲\中国')
    print(filesList)  # city directory names
    # One result per city: rows of (title, zanCount, detailPhontUrl).
    resultList = [c.Zan('%' + city + '%') for city in filesList]
    print(resultList)
    resultZanList = []
    for rows in resultList:
        # Zan() returns None when the query failed; count that city as zero.
        resultZanList.append(sum(row[1] for row in rows or []))
    print(resultZanList)  # total like count per city
    d1 = Drawer(filesList, resultZanList)
    d1.drawerIt()
    if resultZanList:
        # City with the highest total like count.
        mostPopCity = filesList[resultZanList.index(max(resultZanList))]
        print(mostPopCity)
import numpy as np
import matplotlib.pyplot as plt
class Drawer:
    """Render a labelled bar chart from parallel label (x) / value (y) lists."""

    def __init__(self, xList, yList):
        # xList: category labels (e.g. city names); yList: bar heights.
        self.xList = xList
        self.yList = yList

    def drawerIt(self):
        """Draw the bar chart and show it (blocks until the window closes)."""
        # Use a CJK-capable font so Chinese labels render correctly.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        labels = self.xList
        xs = self.xList
        ys = self.yList
        plt.bar(xs, ys, width=0.6)
        plt.grid(linestyle='--')  # dashed background grid
        ax = plt.subplot()
        # NOTE(review): these axis labels look copied from a job-salary
        # chart; confirm they are intended for the city/likes data.
        ax.set_ylabel('salary/month')
        ax.set_xlabel('position')
        ax.set_xticks(xs)           # tick positions
        ax.set_xticklabels(labels)  # tick display text
        for x, y in zip(xs, ys):
            # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in
            # 1.24 (AttributeError); the builtin float formats identically.
            plt.text(x, y + 0.05, '{0}'.format(float(y)), ha='center', va='bottom')
        plt.show()