Python scrapy框架爬虫demo

最新推荐文章于 2023-04-28 09:37:24 发布

小鹏程序

最新推荐文章于 2023-04-28 09:37:24 发布

阅读量258

点赞数

分类专栏： Python

本文链接：https://blog.csdn.net/qq_35979073/article/details/79354291

版权

Python 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

# encoding=utf8

import scrapy
import time
from scrapyLuntan.items import ScrapyluntanItem
import sys

# Python 2 only: make UTF-8 the implicit str<->unicode codec.
# ``reload`` and ``sys.setdefaultencoding`` are not available on Python 3
# (where text is unicode by default), so the hack is skipped there instead
# of crashing at import time.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding("utf-8")
# Module-level list; not referenced by the spider code visible in this file.
lis = []
class LunTan(scrapy.Spider):
    """Spider that collects post timestamps from mop.com pages.

    ``parse`` reads the ``li.mop-item-a`` entries of the start page and
    schedules one request per linked post. ``res_detail`` extracts the
    post's date string, converts it to a Unix timestamp, appends it to
    ``./time.txt`` and yields it in a ``ScrapyluntanItem``.
    """

    # Unique identifier of this spider; every spider in a project must use
    # a different name.
    name = 'luntan'
    # Domains the crawl is restricted to. Scrapy reads the attribute named
    # ``allowed_domains`` and expects bare domain names (not URLs); the
    # parent domain covers both www.mop.com and dzh2.mop.com.
    allowed_domains = ['mop.com']
    # URLs the crawl starts from; ``parse`` is called with each response.
    start_urls = ('http://www.mop.com/',)

    # Wrapper prefix that is stripped from list links so that the real
    # post URL remains.
    RLINK_PREFIX = 'http://dzh2.mop.com/dzh_index.html#rlink='
    # Date layout expected once CJK date markers have been normalised.
    TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def parse(self, response):
        """Yield one detail-page request per post linked from the page.

        :param response: response for one of ``start_urls``.
        :returns: generator of ``scrapy.Request`` objects whose callback is
            ``res_detail`` and whose ``meta['a_href']`` holds the post URL.
        """
        entries = response.xpath("//li[@class='mop-item-a']")
        self.logger.debug("found %d list entries", len(entries))
        for entry in entries:
            # extract_first() returns None instead of raising IndexError
            # when an entry has no direct <a href>.
            href = entry.xpath("./a/@href").extract_first()
            if not href:
                continue
            href = href.replace(self.RLINK_PREFIX, '')
            if not href:
                continue
            # Resolve relative links; scrapy.Request rejects URLs without
            # a scheme. Absolute URLs pass through unchanged.
            href = response.urljoin(href)
            self.logger.debug("queueing %s", href)
            yield scrapy.Request(href, meta={'a_href': href},
                                 callback=self.res_detail)

    def res_detail(self, response):
        """Extract the post time from a detail page and yield it as an item.

        Pages that lack the expected date element, or whose date text does
        not match ``TIME_FORMAT``, are logged and skipped instead of
        aborting the callback.

        :param response: detail-page response; ``response.meta['a_href']``
            is the URL that was requested.
        :returns: generator yielding at most one ``ScrapyluntanItem`` with
            ``item['time']`` set to an integer Unix timestamp.
        """
        detail_url = response.meta['a_href']
        self.logger.debug("parsing %s", detail_url)
        # The two page layouts keep the date in differently classed divs.
        if 'http://dzh2.mop.com/' in detail_url:
            query = "//div[@class='post-date fl mr15']/span/text()"
        else:
            query = "//div[@class='mr20 inlineBlock']/span/text()"

        raw_text = response.xpath(query).extract_first()
        if raw_text is None:
            self.logger.warning("no date element found on %s", detail_url)
            return

        try:
            timestamp = self._to_timestamp(raw_text)
        except ValueError:
            self.logger.warning("unparseable date %r on %s",
                                raw_text, detail_url)
            return

        item = ScrapyluntanItem()
        item['time'] = timestamp
        # Context manager closes the handle even if the write fails.
        with open('./time.txt', 'a+') as out:
            out.write(str(timestamp) + "\r\n")
        yield item

    @staticmethod
    def _to_timestamp(text):
        """Convert a page date string to an integer Unix timestamp.

        Accepts ``YYYY-MM-DD HH:MM:SS`` as well as the CJK form that uses
        year/month/day marker characters. The value is interpreted in the
        local timezone (``time.mktime``).

        :raises ValueError: if the text does not match ``TIME_FORMAT``.
        """
        text = text.strip()
        if u'年' in text:
            # Normalise the year/month/day markers to the dashed layout.
            text = (text.replace(u'年', '-')
                        .replace(u'月', '-')
                        .replace(u'日', ''))
        return int(time.mktime(time.strptime(text, LunTan.TIME_FORMAT)))

    # print timeStamp

小鹏程序

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python scrapy框架爬虫demo

# encoding=utf8 | import scrapy | import time | from scrapyLuntan.items import ScrapyluntanItem | import sys | reload(sys) | sys.setdefaultencoding("utf-8") | lis = [] | class LunTan(scrapy.Spider): | # 这个爬虫的识别名称，必须是唯一的...
复制链接

扫一扫

专栏目录