Create a new Scrapy spider file:
# -*- coding: utf-8 -*-
import scrapy


class CsdnBlogSpider(scrapy.Spider):
    name = 'csdn_blog'
    # so.csdn.net is included so the search pages are not filtered as offsite
    allowed_domains = ['blog.csdn.net', 'so.csdn.net']
    keyword = 'another'

    def start_requests(self):
        # Request the first 10 pages of search results for the keyword
        for pn in range(1, 11):
            url = 'https://so.csdn.net/so/search/s.do?p=%s&q=%s&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0' % (
                pn, self.keyword)
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        # Pull the link of each blog post out of the search result list
        href_s = response.xpath('//div[@class="search-list-con"]/dl//span[@class="mr16"]/../../dt/div/a[1]/@href').extract()
        for href in href_s:
            yield scrapy.Request(
                url=href,
                callback=self.parse2
            )

    def parse2(self, response):
        item = dict(
            # extract_first() is equivalent to xpath('//h1[@class="title-article"]/text()')[0]
            title=response.xpath('//h1[@class="title-article"]/text()').extract_first(),
            # Raw bytes of the page, written to disk by the pipeline
            data=response.body,
        )
        yield item
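The snippets here assume a Scrapy project named s1 (that is the module path the settings below refer to). If you have not created it yet, the skeleton can be generated with the standard commands:

scrapy startproject s1
cd s1
scrapy genspider csdn_blog blog.csdn.net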
Uncomment these two settings in settings.py:
DOWNLOADER_MIDDLEWARES = {
    's1.middlewares.S1DownloaderMiddleware': 543,
}

ITEM_PIPELINES = {
    's1.pipelines.S1Pipeline': 300,
}
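Depending on your project template, two more settings may help: the default ROBOTSTXT_OBEY = True can make Scrapy skip pages disallowed by CSDN's robots.txt, and a small delay is gentler on the site. An optional sketch, using standard Scrapy settings:

ROBOTSTXT_OBEY = False  # the search pages may be disallowed by robots.txt
DOWNLOAD_DELAY = 0.5    # throttle a little to lower the risk of a ban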
In middlewares.py, modify the process_request method of the S1DownloaderMiddleware class:
# Add these imports at the top of middlewares.py
from scrapy.http.headers import Headers
import urllib.request as ur
import user_agent

    # This is usually the method you override
    def process_request(self, request, spider):
        # Header values are stored as bytes, hence scrapy's Headers class rather than a plain dict
        request.headers = Headers(
            {
                'User-Agent': user_agent.get_user_agent_pc(),
            }
        )
        # Fetch a proxy IP from the API and attach it to the request
        request.meta['proxy'] = 'http://' + ur.urlopen(
            'http://api.ip.data5u.com/dynamic/get.html?order=06b5d4a85d10b5cbe9db1e5a3b9fa2e1&sep=4'
        ).read().decode('utf-8').strip()
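As written, this calls the proxy API once for every request, which is slow and eats into the proxy quota. Below is a minimal sketch of caching the proxy and refreshing it only on failure; the 60-second lifetime is an assumption to tune to your provider, while process_exception is part of the standard downloader-middleware interface:

import time

class S1DownloaderMiddleware(object):
    proxy = None           # cached proxy, e.g. 'http://1.2.3.4:8888'
    proxy_fetched_at = 0   # timestamp of the last API call

    def get_proxy(self):
        # Re-use the cached proxy for 60 seconds (assumed lifetime)
        if self.proxy is None or time.time() - self.proxy_fetched_at > 60:
            self.proxy = 'http://' + ur.urlopen(
                'http://api.ip.data5u.com/dynamic/get.html?order=06b5d4a85d10b5cbe9db1e5a3b9fa2e1&sep=4'
            ).read().decode('utf-8').strip()
            self.proxy_fetched_at = time.time()
        return self.proxy

    def process_request(self, request, spider):
        request.headers = Headers({'User-Agent': user_agent.get_user_agent_pc()})
        request.meta['proxy'] = self.get_proxy()

    def process_exception(self, request, exception, spider):
        # On a download error, drop the cached proxy; the retried request goes
        # through process_request again and picks up a fresh one. dont_filter
        # keeps the dupefilter from discarding the retry.
        self.proxy = None
        return request.replace(dont_filter=True)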
Modify pipelines.py:
import os
import re


class S1Pipeline(object):
    def process_item(self, item, spider):
        # Strip characters that are illegal in file names (same fix as the urllib version below)
        title = re.sub(r'[\\/:*<>|"?]', '', item['title'] or 'untitled')
        os.makedirs('blog_html', exist_ok=True)  # make sure the output directory exists
        with open('blog_html/%s.html' % title, 'wb') as f:
            f.write(item['data'])
        return item
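Note that process_item must return the item (or raise DropItem) so that any later pipelines still run. The title sanitization is borrowed from the urllib version at the end of this post, since blog titles routinely contain characters such as ? or : that cannot appear in file names.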
That's basically it. If you don't want to type scrapy crawl <spider name> in the console every time, you can add a start.py file under the spiders folder with the following code:
from scrapy import cmdline

cmdline.execute('scrapy crawl csdn_blog'.split())
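Run it with python start.py; Scrapy locates the project by searching upward from the working directory for scrapy.cfg, so run it from somewhere inside the project.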
Done.
Below is a non-distributed version that does the same thing as the Scrapy project above.
It does not go through a proxy IP, so use it with care; it would be a shame to get your IP banned.
import urllib.request as ur
import lxml.etree as le
import user_agent
import re


def getRequest(url):
    return ur.Request(
        url=url,
        headers={
            'User-Agent': user_agent.get_user_agent_pc(),
            'Cookie': 'TY_SESSION_ID=14e93d1c-5cfb-4692-8416-dc2df061bb5c; JSESSIONID=68E9815DA238619AB37E640211691B8B; uuid_tt_dd=10_20594510460-1585746871024-545182; dc_session_id=10_1585746871024.456447; dc_sid=e127d0cf2db7a2e5cf7ded7b0b7d1880; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1585746876; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_20594510460-1585746871024-545182; c-toolbar-writeguide=1; announcement=%257B%2522isLogin%2522%253Afalse%252C%2522announcementUrl%2522%253A%2522https%253A%252F%252Fblog.csdn.net%252Fblogdevteam%252Farticle%252Fdetails%252F105203745%2522%252C%2522announcementCount%2522%253A1%252C%2522announcementExpire%2522%253A78705482%257D; firstDie=1; __guid=129686286.421372154518304900.1585746901554.712; monitor_count=1; c_ref=https%3A//blog.csdn.net/; __gads=ID=86722e1f5d97e31d:T=1585746904:S=ALNI_MaIZXWpb5EgzqK0TDZB-yNS9h6l_g; searchHistoryArray=%255B%2522python%2522%252C%2522opencv%2522%255D; dc_tos=q8426m; c-login-auto=3; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1585746960'
        }
    )


# (Unused) list of popular language keywords
hot_word = ['C++', 'java', 'python', 'PHP', 'Go', 'Objective-C', 'SQL', 'PL/SQL', 'C', 'Swift', 'Visual Basic']

if __name__ == '__main__':
    keyword = input('Keyword: ')
    pn_start = int(input('Start page: '))
    pn_end = int(input('End page: '))
    for pn in range(pn_start, pn_end + 1):
        # Fetch one page of search results
        request = getRequest(
            'https://so.csdn.net/so/search/s.do?p=%s&q=%s&t=blog&viparticle=&domain=&o=&s=&u=&l=&f=&rbg=0' % (
                pn, keyword)
        )
        response = ur.urlopen(request).read()
        href_s = le.HTML(response).xpath('//div[@class="search-list-con"]/dl//span[@class="mr16"]/../../dt/div/a[1]/@href')
        for href in href_s:
            try:
                # Download the blog post itself
                response_blog = ur.urlopen(
                    getRequest(href)
                ).read()
                title = le.HTML(response_blog).xpath('//h1[@class="title-article"]/text()')[0]
                # Strip characters that are illegal in file names
                title = re.sub(
                    r'[\\/:*<>|"?]', '', title
                )
                with open('blog/%s.html' % title, 'wb') as f:
                    f.write(response_blog)
                print(title)
            except Exception as e:
                print(e)
# Optional proxy support (see the warning above):
# def getProxyOpener():
#     # Either fetch a proxy address from the API...
#     # proxy_address = ur.urlopen(
#     #     'http://api.ip.data5u.com/dynamic/get.html?order=d314e5e5e19b0dfd19762f98308114ba&sep=4'
#     # ).read().decode('utf-8').strip()
#     # ...or hard-code one:
#     proxy_handler = ur.ProxyHandler(
#         {
#             'http': '58.218.214.147:4029'
#         }
#     )
#     return ur.build_opener(proxy_handler)
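To wire the helper above into the main loop, open the requests through the returned opener instead of ur.urlopen; build_opener and OpenerDirector.open are standard urllib APIs. A minimal sketch, assuming getProxyOpener has been uncommented:

opener = getProxyOpener()
response = opener.open(request).read()                # instead of ur.urlopen(request).read()
response_blog = opener.open(getRequest(href)).read()  # instead of ur.urlopen(getRequest(href)).read()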