Straight to the code.
- Spider code:
import scrapy
import lxml.html
from scrapy import Request
import re
from CsDn.items import CsdnItem

number = 1  # current page counter, shared with the pagination requests

class ExampleSpider(scrapy.Spider):
    name = 'csdn'

    def start_requests(self):
        # Kick off with the first page of the blog search results
        url_str = 'https://so.csdn.net/so/search/s.do?q=python&t=blog&o=&s=&l='
        yield Request(url=url_str, callback=self.parse, dont_filter=True, meta={'page': '1'})

    def parse(self, response):
        contents = response.xpath('//dl[@class="search-list J_search"]').extract()
        for i in contents:
            cont = lxml.html.fromstring(i)
            # Skip entries without an author link (e.g. ads mixed into the results)
            if cont.xpath('.//dd[@class="author-time"]/a//text()'):
                url = cont.xpath('.//dd[@class="search-link"]/a/@href')
                author = cont.xpath('.//dd[@class="author-time"]/a//text()')
                title = ''.join(cont.xpath('.//dt/a[1]//text()'))
                post_info = cont.xpath('.//dd[@class="author-time"]/text()')
                comments = 0
                post_time = self.get_post_time(post_info)
                count_views = self.get_count_views(post_info)
                item = CsdnItem()
                item['url'] = ''.join(url)
                item['author'] = ''.join(author)
                item['post_time'] = post_time
                item['count_views'] = count_views
                item['comments'] = comments
                item['title'] = title
                yield item
        # Request the next page; the Selenium middleware performs the actual page jump
        global number
        number += 1
        yield Request(url=response.url, callback=self.parse, dont_filter=True, meta={'page': '2', 'number': number})

    def get_count_views(self, post_info):
        # Pull the view count out of text like "... 浏览1234次 ..."
        post_info = ''.join(post_info)
        count_views = re.search('浏览(.+)次', post_info).group(1)
        count_view = re.search('[0-9]+', count_views).group()
        return int(count_view)

    def get_post_time(self, post_info):
        # Pull the date out of text like "日期:2018-05-20 ..."
        post_info = ''.join(post_info)
        post_time = re.search('日期:(.+) ', post_info).group(1)
        return post_time
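A quick sanity check for the two regex helpers. The sample string below is my assumption about what the `author-time` text looks like, reconstructed from the patterns the spider expects, not copied from the live page:

import re

# Hypothetical sample of the "author-time" text the spider scrapes
sample = '日期:2018-05-20 10:30 浏览1234次'

post_time = re.search('日期:(.+) ', sample).group(1)
count = int(re.search('[0-9]+', re.search('浏览(.+)次', sample).group(1)).group())
print(post_time, count)  # 2018-05-20 10:30 1234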
- Middleware:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FOptions
import time
from scrapy.http import HtmlResponse

class SeleniumMiddlewares(object):
    def __init__(self):
        self.options = FOptions()
        # self.options.add_argument("-headless")  # uncomment to run Firefox headless
        self.browser = webdriver.Firefox(executable_path="/home/hello/Downloads/geckodriver",
                                         firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta['page']) == 2:
            # Later pages: scroll to the bottom, type the page number into the
            # jump box, and click the "go" button so the site renders the new page
            self.browser.get(request.url)
            time.sleep(3)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
            page_input = self.browser.find_element_by_xpath('//span/input')
            page_input.clear()
            page_input.send_keys(request.meta['number'])
            btn = self.browser.find_element_by_xpath('//span[@class="page-go"]/button')
            btn.click()
            time.sleep(3)
        if int(request.meta['page']) == 1:
            # First page: a plain GET is enough
            self.browser.get(request.url)
            time.sleep(2)
        # Hand the rendered page back to Scrapy instead of downloading it again
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source,
                            encoding="utf-8", request=request)
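One thing the middleware above never does is quit Firefox, so the browser process lingers after the crawl. A minimal cleanup sketch using Scrapy's standard signal wiring; it assumes the same SeleniumMiddlewares class as above:

from scrapy import signals

class SeleniumMiddlewares(object):
    # ... __init__ and process_request as above ...

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Quit the browser when the spider finishes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        self.browser.quit()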
- Pipeline:
import pymysql

class CsdnPipeline(object):
    def open_spider(self, spider):
        # Connect to the database; fill in your own credentials here.
        # For a local DB, something like:
        # self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123456', db='movie', charset='utf8')
        host = '47.75.81.75'   # remote host address
        port = 3306            # port number is an int, no quotes
        user = 'user_name'
        password = 'password'
        dbname = 'database_name'
        dbcharset = 'utf8'
        self.conn = pymysql.Connect(host=host, port=port, user=user,
                                    password=password, db=dbname, charset=dbcharset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Write to the database; blogs_csdn is the table name.
        # Parameterized query: let pymysql handle quoting and escaping
        sql = ('insert into blogs_csdn(url, author, post_time, count_views, comments, title) '
               'values (%s, %s, %s, %s, %s, %s)')
        try:
            self.cursor.execute(sql, (item['url'], item['author'], item['post_time'],
                                      item['count_views'], item['comments'], item['title']))
            self.conn.commit()
            print('#' * 10 + ' saved successfully')
        except Exception as e:
            print('*' * 10 + ' insert failed')
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
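The pipeline assumes a blogs_csdn table already exists. Here is a sketch of a schema matching the item fields; the column types and lengths are my assumption, not taken from the original setup, so adjust them to your data:

import pymysql

# Hypothetical schema matching the CsdnItem fields
DDL = '''
create table if not exists blogs_csdn (
    id int auto_increment primary key,
    url varchar(255),
    author varchar(64),
    post_time varchar(32),
    count_views int,
    comments int,
    title varchar(255)
) default charset=utf8
'''

conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', db='database_name', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()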
- Items:
import scrapy

class CsdnItem(scrapy.Item):
    url = scrapy.Field()
    author = scrapy.Field()
    post_time = scrapy.Field()
    count_views = scrapy.Field()
    comments = scrapy.Field()
    title = scrapy.Field()
- The stored results are shown in the figure: