Read good code the way you read the literature
豆瓣电影 (Douban Movies)
spider.py
import requests
from lxml import etree
# 1. Fetch the target site's page
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36",
'Referer': "https://movie.douban.com/explore"
}
url = 'https://movie.douban.com/cinema/nowplaying/chengdu/'
response = requests.get(url, headers=headers)
# print(response.text)
html = response.text
# 2. Extract the data from the fetched page
html = etree.HTML(html)
# there are two ul lists (now showing and coming soon); take the first
ul = html.xpath("//ul[@class='lists']")[0]
# print(etree.tostring(ul,encoding='utf-8').decode("utf-8"))
# all li elements under the current ul
lis = ul.xpath("./li")
movies = []
for li in lis:
# print(etree.tostring(li,encoding='utf-8').decode('utf-8'))
title = li.xpath("@data-title")[0]
score = li.xpath("@data-score")[0]
duration = li.xpath("@data-duration")[0]
region = li.xpath("@data-region")[0]
director = li.xpath("@data-director")[0]
    actor = li.xpath("@data-actors")[0]
poster = li.xpath(".//img/@src")[0]
movie = {
'title': title,
'score': score,
'duration': duration,
'region': region,
'director': director,
'actor': actor,
'thumbnail': poster
}
movies.append(movie)
print(movies)
README.md
# Scraping Douban movies to practice XPath
## url: https://movie.douban.com/cinema/nowplaying/chengdu/
### Notes:
* response.text returns a decoded string, of type str (unicode)
* response.content returns the raw string exactly as fetched from the page, with no processing applied;
it is of type bytes
* XPath rules:
+ ./ all child nodes of the current node
+ @attr selects an attribute
+ .// all descendant nodes of the current node
+ .//img/@src the src attribute of img tags
* thumbnail: the poster thumbnail image
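A quick sketch of the text/content difference (requests guesses an encoding for .text, while .content leaves decoding to you; the explicit decode below assumes the page is UTF-8, which this one is):

import requests

response = requests.get('https://movie.douban.com/cinema/nowplaying/chengdu/')
print(type(response.content))   # <class 'bytes'>: the raw payload
print(type(response.text))      # <class 'str'>: decoded with a guessed encoding
html = response.content.decode('utf-8')  # explicit decode, equivalent here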
电影天堂 (Movie Paradise)
from lxml import etree
import requests
BASE_DOMAIN = 'http://www.ygdy8.net'
HEADERS = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36",
}
def get_detail_urls(url):
response = requests.get(url, headers=HEADERS)
# print(response.text)
# print(response.content.decode('gbk'))
html = response.text
html = etree.HTML(html)
detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
# for detail_url in detail_urls:
# print(BASE_DOMAIN+detail_url)
detail_urls = map(lambda url:BASE_DOMAIN+url, detail_urls)
return detail_urls
def parse_detail_page(url):
movie = {}
response = requests.get(url,headers=HEADERS)
text = response.content.decode('gbk')
html = etree.HTML(text)
title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
movie['title'] = title
zoomE = html.xpath("//div[@id='Zoom']")[0]
imgs = zoomE.xpath(".//img/@src")
    # xpath always returns a list; when two images are present, the first is
    # the poster and the second a screenshot
    if len(imgs) >= 2:
        movie['cover'] = imgs[0]
        movie['screenshot'] = imgs[1]
    elif imgs:
        movie['cover'] = imgs[0]
def parse_infor(infor, rule):
return infor.replace(rule,"").strip()
infors = zoomE.xpath(".//text()")
for index,infor in enumerate(infors):
# print(infor)
# print(index)
# print("="*30)
if infor.startswith("◎年 代"):
infor = parse_infor(infor,"◎年 代")
movie['year'] = infor
elif infor.startswith("◎产 地"):
infor = parse_infor(infor, "◎产 地")
movie['country'] = infor
elif infor.startswith("◎类 别"):
infor = parse_infor(infor, "◎类 别")
movie['category'] = infor
elif infor.startswith("◎语 言"):
infor = parse_infor(infor, "◎语 言")
movie['language'] = infor
elif infor.startswith("◎IMDb评分"):
infor = parse_infor(infor, "◎IMDb评分")
movie['imdb_rating'] = infor
elif infor.startswith("◎片 长"):
infor = parse_infor(infor, "◎片 长")
movie['duration'] = infor
elif infor.startswith("◎导 演"):
infor = parse_infor(infor, "◎导 演")
movie['director'] = infor
elif infor.startswith("◎主 演"):
infor = parse_infor(infor, "◎主 演")
actors = [infor]
for x in range(index+1, len(infors)):
actor = infors[x].strip()
if actor.startswith("◎简 介"):
break
actors.append(actor)
movie['actors'] = actors
elif infor.startswith("◎简 介"):
infor = parse_infor(infor, "◎简 介")
# movie['introduction'] = infor
for x in range(index+1,len(infors)):
profile = infors[x].strip()
if profile.startswith("【下载地址】"):
break
movie['profile'] = profile
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    movie['download_url'] = download_url
return movie
def spider():
    # {} is a placeholder that format() fills in
base_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
movies = []
for x in range(1,8):
        # the outer loop walks the 7 list pages
url = base_url.format(x)
detail_urls = get_detail_urls(url)
for detail_url in detail_urls:
            # the inner loop visits every movie detail url on the page
movie = parse_detail_page(detail_url)
movies.append(movie)
print(movie)
# print(movies)
if __name__ == '__main__':
spider()
中国天气网 (China Weather Network)
import requests
from bs4 import BeautifulSoup
from pyecharts import Bar
ALL_DATA = []
def parse_page(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
response = requests.get(url, headers=headers)
text = response.content.decode('utf-8')
soup = BeautifulSoup(text, 'html5lib')
conMidtab = soup.find('div', class_='conMidtab')
tables = conMidtab.find_all('table')
for table in tables:
trs = table.find_all('tr')[2:]
for index, tr in enumerate(trs):
tds = tr.find_all('td')
city_td = tds[0]
if index == 0:
city_td = tds[1]
city = list(city_td.stripped_strings)[0]
temp_td = tds[-2]
min_temp = list(temp_td.stripped_strings)[0]
ALL_DATA.append({"city": city, "min_temp": int(min_temp)})
# print({"city":city,"min_temp":min_temp})
def main():
# url = 'http://www.weather.com.cn/textFC/hb.shtml#'
# url = 'http://www.weather.com.cn/textFC/db.shtml'
# url = 'http://www.weather.com.cn/textFC/hd.shtml'
# url = 'http://www.weather.com.cn/textFC/gat.shtml'
urls = [
'http://www.weather.com.cn/textFC/hb.shtml',
'http://www.weather.com.cn/textFC/db.shtml',
'http://www.weather.com.cn/textFC/hd.shtml',
'http://www.weather.com.cn/textFC/hz.shtml',
'http://www.weather.com.cn/textFC/hn.shtml',
'http://www.weather.com.cn/textFC/xb.shtml',
'http://www.weather.com.cn/textFC/xn.shtml',
'http://www.weather.com.cn/textFC/gat.shtml'
]
for url in urls:
parse_page(url)
    # analyze the data:
    # sort by minimum temperature
ALL_DATA.sort(key=lambda data: data['min_temp'])
data = ALL_DATA[0:10]
cities = list(map(lambda x: x['city'], data))
temps = list(map(lambda x: x['min_temp'], data))
chart = Bar("中国天气最低温排行榜")
chart.add('', cities, temps)
chart.render('temperature.html')
if __name__ == '__main__':
main()
# ALL_DATA = [
# {"city": "北京", 'min_temp': '-8'},
# {"city": "天津", 'min_temp': '-9'}
# ]
#
# def sort_key(data):
# min_temp = data['min_temp']
# return min_temp
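Note that from pyecharts import Bar and chart.add(...) belong to the pyecharts 0.x API. On pyecharts 1.x or later the equivalent is roughly the following sketch (untested against a specific version):

from pyecharts.charts import Bar
from pyecharts import options as opts

chart = (
    Bar()
    .add_xaxis(cities)
    .add_yaxis('', temps)
    .set_global_opts(title_opts=opts.TitleOpts(title="中国天气最低温排行榜"))
)
chart.render('temperature.html')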
古诗文网 (Gushiwen)
import re
import requests
def parse_page(url):
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36"
}
response = requests.get(url, headers=headers)
text = response.text
titles = re.findall(r'<div\sclass="cont".*?<b>(.*?)</b>', text, re.S)
dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.S)
authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.S)
content_tags = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.S)
contents = []
for content in content_tags:
x = re.sub(r'<.*?>', '', content)
contents.append(x.strip())
poems = []
for value in zip(titles, dynasties, authors, contents):
title, dynasty, author, content = value
poem = {
'title': title,
'dynasty': dynasty,
'author': author,
'content': content
}
poems.append(poem)
for poem in poems:
print(poem)
print("=" * 40)
def main():
# url = 'https://www.gushiwen.org/default_1.aspx'
# parse_page(url)
for x in range(1, 11):
url = 'https://www.gushiwen.org/default_%s.aspx' % x
parse_page(url)
if __name__ == '__main__':
main()
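One caveat with this parallel-findall approach: zip() truncates to the shortest input, so if any one regex misses a single entry, all later fields silently shift and pair with the wrong poem. A tiny illustration:

titles = ['a', 'b', 'c']
authors = ['x', 'y']                # one regex missed an entry
print(list(zip(titles, authors)))   # [('a', 'x'), ('b', 'y')] -- 'c' is silently dropped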
Threads, processes, and coroutines in practice
asyncio
import asyncio
import threading
@asyncio.coroutine
def hello():
    print("Hello world! (%s)" % threading.currentThread())
    # asynchronously call asyncio.sleep(1):
    r = yield from asyncio.sleep(1)
    print("Hello again! (%s)" % threading.currentThread())

# get the event loop:
loop = asyncio.get_event_loop()
tasks = [hello(), hello()]
# run both coroutines to completion
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
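@asyncio.coroutine and yield from are deprecated (removed in Python 3.11); a minimal equivalent in modern syntax:

import asyncio
import threading

async def hello():
    print("Hello world! (%s)" % threading.current_thread())
    await asyncio.sleep(1)
    print("Hello again! (%s)" % threading.current_thread())

async def main():
    # run both coroutines concurrently
    await asyncio.gather(hello(), hello())

asyncio.run(main())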
coroutine.py
def consumer():
r = ''
while True:
n = yield r
if not n:
            print('executing return')
return
print('[CONSUMER] Consuming %s...' % n)
r = '200 OK'
def produce(c):
c.send(None)
n = 0
while n < 5:
n = n + 1
print('[PRODUCER] Producing %s...' % n)
r = c.send(n)
        print('[PRODUCER] Consumer return: %s' % r)
c.close()
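To drive the pipeline, create the consumer generator and hand it to produce; this is the usual two-line driver for this classic example:

c = consumer()
produce(c)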
Multithreading (threading)
import time
import threading
# # # the traditional, sequential way
# def coding():
#     for x in range(3):
#         print('writing code %s' % x)
#         time.sleep(1)
#
# def drawing():
#     for x in range(3):
#         print('drawing %s' % x)
#         time.sleep(1)
#
# def main():
#     coding()
#     drawing()
#
# if __name__ == '__main__':
#     main()

# the multithreaded way
def coding():
    for x in range(3):
        print('writing code %s' % threading.current_thread())
        time.sleep(1)

def drawing():
    for x in range(3):
        print('drawing %s' % threading.current_thread())
        time.sleep(1)
def main():
t1 = threading.Thread(target=coding)
t2 = threading.Thread(target=drawing)
t1.start()
t2.start()
print(threading.enumerate())
if __name__ == '__main__':
main()
Multithreading demo 2: Thread subclasses
import threading
import time
class CodingThread(threading.Thread):
def run(self):
for x in range(3):
            print('writing code %s' % threading.current_thread())
time.sleep(1)
class DrawingThread(threading.Thread):
def run(self):
for x in range(3):
            print('drawing %s' % threading.current_thread())
time.sleep(1)
def main():
t1 = CodingThread()
t2 = DrawingThread()
t1.start()
t2.start()
if __name__ == '__main__':
main()
Thread locks (threading.Lock)
import threading
# VALUE = 0
#
# gLock = threading.Lock()
#
# def add_value():
# global VALUE
# for x in range(1000000):
# VALUE += 1
# print('value: %d'%VALUE)
#
#
# def main():
# for x in range(2):
# t = threading.Thread(target=add_value)
# t.start()
#
# if __name__ == '__main__':
# main()
VALUE = 0
gLock = threading.Lock()
def add_value():
global VALUE
gLock.acquire()
for x in range(1000000):
VALUE += 1
gLock.release()
print('value: %d'%VALUE)
def main():
for x in range(2):
t = threading.Thread(target=add_value)
t.start()
if __name__ == '__main__':
main()
Producer-consumer pattern
import threading
import random
import time
gMoney = 1000
gCondition = threading.Condition()
gTotalTimes = 10
gTimes = 0
class Producer(threading.Thread):
def run(self):
global gMoney
global gTimes
while True:
money = random.randint(100,1000)
gCondition.acquire()
if gTimes >= gTotalTimes:
gCondition.release()
break
gMoney += money
            print('%s produced %d yuan; %d yuan now available' % (threading.current_thread(), money, gMoney))
gTimes += 1
gCondition.notify_all()
gCondition.release()
time.sleep(0.5)
class Consumer(threading.Thread):
def run(self):
global gMoney
while True:
money = random.randint(100,1000)
gCondition.acquire()
while gMoney < money:
if gTimes >= gTotalTimes:
gCondition.release()
return
print("%s消费者准备消费%d元钱,剩余%d元钱,不足!"%(threading.current_thread(),money,gMoney))
gCondition.wait()
gMoney -= money
            print('%s spent %d yuan; %d yuan left' % (threading.current_thread(), money, gMoney))
gCondition.release()
time.sleep(0.5)
def main():
for x in range(3):
        t = Consumer(name="consumer-thread-%d" % x)
t.start()
for x in range(4):
        t = Producer(name="producer-thread-%d" % x)
t.start()
if __name__ == '__main__':
main()
Queues (queue.Queue)
import time
from queue import Queue
import threading
# q.put(1)
# q.put(2)
#
# print(q.qsize(),q.empty(),q.full())
# for x in range(4):
# q.put(x)
#
# for x in range(4):
# print(q.get())
def set_value(q):
index = 0
while True:
q.put(index)
index += 1
time.sleep(3)
def get_value(q):
while True:
print(q.get())
def main():
q = Queue(4)
    # args passes the queue into the target function
t1 = threading.Thread(target=set_value, args=[q])
t2 = threading.Thread(target=get_value, args=[q])
t1.start()
t2.start()
if __name__ == '__main__':
main()
斗图 (Doutula memes)
import os
import re
import requests
from lxml import etree
from urllib import request
def parse_page(url):
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36"
}
response = requests.get(url,headers=headers)
text = response.text
html = etree.HTML(text)
imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
for img in imgs:
img_url = img.get('data-original')
alt = img.get('alt')
alt = re.sub(r'[\??\.!!。,]','',alt)
suffix = os.path.splitext(img_url)[1]
filename = alt + suffix
request.urlretrieve(img_url,'images/' + filename)
def main():
    os.makedirs('images', exist_ok=True)  # urlretrieve needs the target directory to exist
    for x in range(1, 101):
url = 'https://www.doutula.com/photo/list/?page=%d'%x
parse_page(url)
# break
if __name__ == '__main__':
main()
斗图, multithreaded producer-consumer version
import os
import re
from queue import Queue
import requests
from lxml import etree
from urllib import request
import threading
class Producer(threading.Thread):
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36"
}
    # *args / **kwargs forward any parent-class arguments; page_queue and img_queue are our own
def __init__(self,page_queue,img_queue,*args,**kwargs):
        # super() makes sure the parent class is initialized properly
super(Producer, self).__init__(*args,**kwargs)
self.page_queue = page_queue
self.img_queue = img_queue
def run(self):
while True:
if self.page_queue.empty():
break
url = self.page_queue.get()
self.parse_page(url)
def parse_page(self, url):
response = requests.get(url, headers=self.headers)
text = response.text
html = etree.HTML(text)
imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
for img in imgs:
img_url = img.get('data-original')
alt = img.get('alt')
alt = re.sub(r'[\??\.!!。,]', '', alt)
suffix = os.path.splitext(img_url)[1]
filename = alt + suffix
            # put the (img_url, filename) tuple on the queue
self.img_queue.put((img_url, filename))
class Consumer(threading.Thread):
def __init__(self,page_queue,img_queue,*args,**kwargs):
super(Consumer, self).__init__(*args,**kwargs)
self.page_queue = page_queue
self.img_queue = img_queue
    def run(self):
        # keep downloading until both queues are drained; note this check is racy:
        # a producer may still be parsing while both queues are momentarily empty
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
img_url,filename = self.img_queue.get()
request.urlretrieve(img_url, 'images/' + filename)
print(filename + "已下载完成")
def main():
    os.makedirs('images', exist_ok=True)  # the consumer saves into images/
    page_queue = Queue(100)
img_queue = Queue(1000)
for x in range(1,20):
url = 'https://www.doutula.com/photo/list/?page=%d'%x
page_queue.put(url)
for x in range(5):
t = Producer(page_queue,img_queue)
t.start()
for x in range(5):
t = Consumer(page_queue, img_queue)
t.start()
if __name__ == '__main__':
main()
README.md
# threading
## Comparing the traditional approach with the multithreaded one
## Notes:
1. List all live threads with print(threading.enumerate());
for demo1 it prints [<_MainThread(MainThread, started 8008)>, <Thread(Thread-1, started 18320)>, <Thread(Thread-2, started 6132)>]
2. Get the name of the current thread
3. Wrapping each thread in a class keeps it more self-contained
4. Multithreading pitfall: when several threads modify a global variable (demo3 has two threads incrementing VALUE), runs can interleave and clobber each other, so a lock is required
5. Producer-consumer with Lock (demo4); gTotalTimes=10 and gTimes=0 are the conditions used to break out of the loops
6. threading.Condition can block and wait while no data is available; the wait()/notify_all() mechanism is more efficient and saves memory
7. queue is thread-safe and its operations are atomic
## Application: grabbing memes from 斗图吧
### url: https://www.doutula.com/photo/list/?page=%d
### Notes:
1. Skeleton: write main() with the urls first, then the parse function
2. demo8 is asynchronous and multithreaded, structured as producer-consumer:
page url → producer (extracts meme urls) → meme url → consumer (downloads memes); a shutdown sketch follows below
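A cleaner way to break out of the consumer loop than the racy empty() check is a sentinel value: each producer pushes one end-marker when it finishes, and a consumer exits when it sees it. A minimal sketch (the None sentinel and the single producer/consumer pair are illustrative assumptions, not the demo8 code):

import threading
from queue import Queue

SENTINEL = None  # end-of-work marker

def producer(q):
    for i in range(5):
        q.put(i)      # real work items
    q.put(SENTINEL)   # tell one consumer to stop

def consumer(q):
    while True:
        item = q.get()
        if item is SENTINEL:
            break     # graceful exit: no busy-wait, no race
        print('consumed', item)

q = Queue()
threading.Thread(target=producer, args=(q,)).start()
threading.Thread(target=consumer, args=(q,)).start()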
scrapy
糗事百科 (Qiushibaike)
start.py
from scrapy import cmdline
# cmdline.execute("scrapy crawl qsbk_spider".split())
cmdline.execute(["scrapy",'crawl','qsbk_spider'])
qsbk_spider.py
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.selector.unified import SelectorList
from ..items import QsbkItem
class QsbkSpiderSpider(scrapy.Spider):
name = 'qsbk_spider'
allowed_domains = ['qiushibaike.com']
start_urls = ['https://www.qiushibaike.com/text/page/1/']
base_domain = "https://www.qiushibaike.com"
def parse(self, response):
# selectorlist
duanzidivs = response.xpath('//div[@id="content-left"]/div')
for duanzidiv in duanzidivs:
# selector
author = duanzidiv.xpath('.//h2/text()').get().strip()
content = duanzidiv.xpath('.//div[@class="content"]//text()').getall()
content = ''.join(content).strip()
item = QsbkItem(author=author, content=content)
yield item
        next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').get()
        # stop when there is no next page; otherwise follow it
        if not next_url:
            return
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
items.py
import scrapy
class QsbkItem(scrapy.Item):
collection = table = 'duanzi'
author = scrapy.Field()
content = scrapy.Field()
pipelines.py
from scrapy.exporters import JsonLinesItemExporter
class QsbkPipeline(object):
def __init__(self):
self.fp = open('duanzi.json', 'wb')
self.exporter = JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
    def open_spider(self, spider):
        print("export started")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("spider finished")
import pymongo
class MongoPipeline(object):
def __init__(self,mongo_url,mongo_db):
self.mongo_url = mongo_url
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls,crawler):
return cls(
mongo_url=crawler.settings.get('MONGO_URL'),
mongo_db=crawler.settings.get('MONGO_DB')
)
def open_spider(self,spider):
self.client = pymongo.MongoClient(self.mongo_url)
self.db = self.client[self.mongo_db]
def process_item(self,item,spider):
        self.db[item.collection].insert_one(dict(item))
return item
def close_spider(self,spider):
self.client.close()
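For from_crawler to find its configuration, settings.py needs the matching keys, and the pipelines must be enabled; a sketch (the module name qsbk and the priority values are assumptions):

# settings.py (sketch)
MONGO_URL = 'mongodb://localhost:27017'
MONGO_DB = 'qsbk'
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
    'qsbk.pipelines.MongoPipeline': 400,
}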
宝马5系 (BMW 5 Series gallery on Autohome)
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl bmw5".split())
spiders
bmw5.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import BmwItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class Bmw5Spider(CrawlSpider):
name = 'bmw5'
allowed_domains = ['car.autohome.com.cn']
start_urls = ['https://car.autohome.com.cn/pic/series/65.html']
rules = (
Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/65.+'),callback="parse_page", follow=True),
)
def parse_page(self, response):
category = response.xpath('//div[@class="uibox"]//text()').get()
srcs = response.xpath('//div[@class="uibox"]//ul/li/a/img/@src').getall()
srcs = list(map(lambda x:response.urljoin(x.replace('t_','')),srcs))
yield BmwItem(category=category,image_urls=srcs)
def test_parse(self,response):
uiboxs = response.xpath('//div[@class="uibox"]')[1:]
for uibox in uiboxs:
category = uibox.xpath('./div[@class="uibox-title"]/a/text()').get()
urls = uibox.xpath('.//ul/li/a/img/@src').getall()
urls = list(map(lambda url: response.urljoin(url), urls))
item = BmwItem(category=category, image_urls=urls)
yield item
items.py
import scrapy
class BmwItem(scrapy.Item):
category = scrapy.Field()
image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import os
from urllib import request
from scrapy.pipelines.images import ImagesPipeline
from . import settings
class BmwPipeline(object):
def __init__(self):
self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'images')
if not os.path.exists(self.path):
os.mkdir(self.path)
def process_item(self, item, spider):
category = item['category']
        urls = item['image_urls']
category_path = os.path.join(self.path,category)
if not os.path.exists(category_path):
os.mkdir(category_path)
for url in urls:
image_name = url.split('_')[-1]
request.urlretrieve(url,os.path.join(category_path,image_name))
return item
class BMWImagePipeline(ImagesPipeline):
def get_media_requests(self, item, info):
        # the parent implementation is what actually builds the download requests
request_objs = super(BMWImagePipeline,self).get_media_requests(item,info)
for request_obj in request_objs:
request_obj.item = item
return request_objs
def file_path(self, request, response=None, info=None):
path = super(BMWImagePipeline, self).file_path(request,response,info)
category = request.item.get('category')
images_store = settings.IMAGES_STORE
category_path = os.path.join(images_store,category)
if not os.path.exists(category_path):
os.mkdir(category_path)
image_name = path.replace("full/","")
image_path = os.path.join(category_path,image_name)
return image_path
settings.py
import os

# directory for downloaded images, used by the ImagesPipeline
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
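The custom image pipeline only runs once it is registered; a sketch, assuming the project module is named bmw:

# settings.py (sketch; the module name bmw is an assumption)
ITEM_PIPELINES = {
    'bmw.pipelines.BMWImagePipeline': 1,
}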
简书 (Jianshu)
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl js".split())
spiders
js.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import ArticleItem
from ..middlewares import SeleniumDownloadMiddleware
class JsSpider(CrawlSpider):
name = 'js'
allowed_domains = ['jianshu.com']
start_urls = ['https://www.jianshu.com/']
rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
)
    def parse_detail(self, response):
        title = response.xpath('//div[@class="article"]/h1/text()').get()
        avatar = response.xpath('//div[@class="article"]//a[@class="avatar"]/img/@src').get()
        author = response.xpath('//div[@class="author"]//span[@class="name"]/a/text()').get()
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get().replace("*", "")
        # the article id is the last path segment of the canonical url
        url = response.url.split("?")[0]
        article_id = url.split('/')[-1]
        content = response.xpath('//div[@class="show-content"]').get()
        word_count = int(response.xpath('//span[@class="wordage"]/text()').get().replace("字数 ", ""))
        comment_count = response.xpath('//span[@class="comments-count"]/text()').get().replace("评论 ", "")
        like_count = response.xpath('//span[@class="likes-count"]/text()').get().replace("喜欢 ", "")
        read_count = response.xpath('//span[@class="views-count"]/text()').get().replace("阅读 ", "")
        subjects = ','.join(response.xpath('//div[@class="include-collection"]/a/div/text()').getall())
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            article_id=article_id,
            origin_url=response.url,
            content=content,
            pub_time=pub_time,
            subjects=subjects,
            word_count=word_count,
            comment_count=comment_count,
            read_count=read_count,
            like_count=like_count,
        )
        yield item
items.py
import scrapy
class ArticleItem(scrapy.Item):
collection = scrapy.Field()
title = scrapy.Field()
content = scrapy.Field()
article_id = scrapy.Field()
origin_url = scrapy.Field()
author = scrapy.Field()
avatar = scrapy.Field()
read_count = scrapy.Field()
subjects = scrapy.Field()
like_count = scrapy.Field()
word_count = scrapy.Field()
pub_time = scrapy.Field()
    comment_count = scrapy.Field()
pipelines.py
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors
class JianshuSpiderPipeline(object):
def __init__(self):
dbparams = {
'host': '127.0.0.1',
'port': 3306,
'user': 'root',
            'password': 'your_password',  # replace with your own credentials
'database': 'jianshu',
'charset': 'utf8'
}
self.conn = pymysql.connect(**dbparams)
self.cursor = self.conn.cursor()
self._sql = None
    def process_item(self, item, spider):
        # the parameter order must match the column order in self.sql
        self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'], item['article_id'], item['read_count'], item['subjects'], item['like_count'], item['word_count'], item['comment_count']))
self.conn.commit()
return item
@property
def sql(self):
if not self._sql:
self._sql = """
insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id,read_count,subjects,like_count,word_count,comment_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
        return self._sql
class JianshuTwistedPipeline(object):
def __init__(self):
dbparams = {
'host': '127.0.0.1',
'port': 3306,
'user': 'root',
            'password': 'your_password',  # replace with your own credentials
'database': 'jianshu',
'charset': 'utf8',
'cursorclass': cursors.DictCursor
}
self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
self._sql = None
@property
def sql(self):
if not self._sql:
self._sql = """
insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id,subjects,read_count,like_count,word_count,comment_count) values(NULL,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
        return self._sql
    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item
def insert_item(self,cursor,item):
cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],item['pub_time'],item['origin_url'],item['article_id'],item['subjects'],item['read_count'],item['like_count'],item['word_count'],item['comment_count']))
def handle_error(self,error,item,spider):
print('='*10+'error'+'='*10)
print(error)
print('='*10+'error'+'='*10)
搜房网 (Fang.com)
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl sfw".split())
sfw.py
# -*- coding: utf-8 -*-
import re
import requests
import scrapy
from ..items import NewhouseItem,ESFItem
class SfwSpider(scrapy.Spider):
name = 'sfw'
allowed_domains = ['fang.com']
start_urls = ['http://www.fang.com/SoufunFamily.htm']
def parse(self, response):
trs = response.xpath('//div[@class="letterSelt"]//div[@id="c02"]//tr')
province = None
for tr in trs:
tds = tr.xpath('./td[not(@class)]')
province_td = tds[0]
province_text = province_td.xpath('.//text()').get()
province_text = re.sub(r'\s','',province_text)
if province_text:
province = province_text
if province == '其它':
continue
city_td = tds[1]
city_links = city_td.xpath('./a')
for city_link in city_links:
city = city_link.xpath('./text()').get()
city_url = city_link.xpath('./@href').get()
if 'bj.' in city_url:
city_newhouse_url = 'http://newhouse.fang.com/house/s/'
city_esf_url = 'http://esf.fang.com/'
else:
city_newhouse_url = re.sub("fang.com","newhouse.fang.com/house/s",city_url)
city_esf_url = re.sub("fang.com","esf.fang.com",city_url)
# if city_newhouse_url in ['http://qianxinan.newhouse.fang.com/house/s/','http://esf.changji.newhouse.fang.com/house/s/',
# 'http://wj.esf.newhouse.fang.com/house/s/',]:
# return
# if city_newhouse_url or city_esf_url in ['http://esf.changji.esf.fang.com/','http://leizhou.esf.fang.com/',
# 'http://yongkang.esf.fang.com/','http://zjg.esf.fang.com/',
# 'http://qianxinan.newhouse.fang.com/house/s/','http://esf.changji.newhouse.fang.com/house/s/',
# 'http://wj.esf.newhouse.fang.com/house/s/','http://yiwu.esf.fang.com/', 'http://kaili.esf.fang.com/',
# 'http://zjg.esf.fang.com/',]:
# return
# print(province,city)
# print(city_esf_url,city_newhouse_url)
yield scrapy.Request(url=city_newhouse_url,callback=self.parse_newhouse,meta={'info':(province,city)})
yield scrapy.Request(url=city_esf_url,callback=self.parse_esf,meta={'info':(province,city),})
# break
# break
def parse_newhouse(self,response):
province,city = response.meta.get('info')
lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
for li in lis:
name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
if name:
name = name.strip()
else:
continue
house_style = li.xpath('.//div[contains(@class,"house_type")]/a/text()').getall()
# house_style = list(filter(lambda x:x.endswith("居"),house_style))
area = ''.join(li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
area = re.sub('-|/ |\s','',area)
address = li.xpath('.//div[@class="address"]/a/@title').get()
district = li.xpath('.//div[@class="address"]/a/span/text()').get()
district = re.sub('\s|\[|\]','',str(district))
# print(district)
sale = ''.join(li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').getall())
price = ''.join(li.xpath('.//div[@class="nhouse_price"]//text()').getall()).strip()
origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
item = NewhouseItem(name=name,house_style=house_style,area=area,address=address,district=district,sale=sale,price=price,origin_url=origin_url,province=province,city=city)
yield item
next_url = response.xpath('.//div[@class="page"]//li/a[@class="next"]/@href').get()
if next_url:
yield scrapy.Request(url=response.urljoin(next_url),callback=self.parse_newhouse,meta={'info':(province,city)})
def parse_esf(self,response):
province,city = response.meta.get('info')
# if "北京" in city:
# response = requests.get('http://esf.fang.com/')
dls = response.selector.xpath('//div[contains(@class,"shop_list")]/dl')
for dl in dls:
name = dl.xpath('.//dd/h4/a/@title').get()
if name:
name = name.strip()
else:
continue
infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
            infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
            # not every listing carries all fields; default them so building the
            # item below cannot raise a NameError
            house_style = house_area = house_floor = house_direction = house_year = None
            for info in infos:
if info.endswith("厅"):
house_style = info
elif info.endswith("㎡"):
house_area = info
elif "层" in info:
house_floor = info
elif "向" in info:
house_direction = info
elif "年" in info:
house_year = info.replace('建','')
house_append = dl.xpath('.//p[contains(@class,"label")]/span/text()').get()
if not house_append:
continue
address = ''.join(dl.xpath('.//p[@class="add_shop"]//span/text()').getall())
title = ''.join(dl.xpath('.//p[@class="add_shop"]/a/@title').getall())
price = ''.join(dl.xpath('.//dd[@class="price_right"]/span[position()=1]//text()').getall())
unit = ''.join(dl.xpath('.//dd[@class="price_right"]/span[position()=2]//text()').getall())
origin_url = dl.xpath('.//dd/h4/a/@href').get()
origin_url = response.urljoin(origin_url)
item = ESFItem(name=name,house_style=house_style,house_area=house_area,house_year=house_year,house_append=house_append,house_floor=house_floor,house_direction=house_direction,address=address,price=price,unit=unit,origin_url=origin_url,title=title,province=province,city=city)
yield item
next_url = response.xpath('//div[@class="page_al"]/p[position()=1]/a/@href').get()
if next_url:
yield scrapy.Request(url=response.urljoin(next_url),callback=self.parse_esf,meta={'info':(province,city)})
pipelines.py
from scrapy.exporters import JsonLinesItemExporter
from .items import NewhouseItem,ESFItem
import pymongo
class FangPipeline(object):
def __init__(self):
self.newhouse_fp = open('newhouse.json','wb')
self.esf_fp = open('esf.json','wb')
self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,ensure_ascii=False)
self.esf_exporter = JsonLinesItemExporter(self.esf_fp,ensure_ascii=False)
def process_item(self, item, spider):
if isinstance(item,NewhouseItem):
self.newhouse_exporter.export_item(item)
elif isinstance(item,ESFItem):
self.esf_exporter.export_item(item)
return item
def close_spider(self,spider):
self.newhouse_fp.close()
self.esf_fp.close()
class MongoPipeline(object):
def __init__(self,mongo_url,mongo_db):
self.mongo_url = mongo_url
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls,crawler):
return cls(
mongo_url=crawler.settings.get('MONGO_URL'),mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self,spider):
self.client = pymongo.MongoClient(self.mongo_url)
self.db = self.client[self.mongo_db]
self.db[NewhouseItem.collection].create_index([('id',pymongo.ASCENDING)])
self.db[ESFItem.collection].create_index([('id',pymongo.ASCENDING)])
def close_spider(self,spider):
self.client.close()
def process_item(self,item,spider):
if isinstance(item,NewhouseItem) or isinstance(item,ESFItem):
            self.db[item.collection].insert_one(dict(item))
return item
middlewares.py
import logging
import random

import requests
class UserAgentDownloadMiddleware(object):
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
'Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
'Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
]
def process_request(self,request,spider):
user_agent = random.choice(self.USER_AGENTS)
request.headers['User-Agent'] = user_agent
class ProxyMiddleware(object):
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # fetch one proxy address from the proxy-pool service
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # only switch to a proxy once Scrapy is retrying the request
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('using proxy ' + proxy)
                request.meta['proxy'] = uri
@classmethod
def from_crawler(cls,crawler):
settings = crawler.settings
return cls(
proxy_url=settings.get('PROXY_URL')
)
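Both middlewares take effect only after being registered in settings.py; a sketch (the module name fang and the proxy-pool endpoint are assumptions):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownloadMiddleware': 543,
    'fang.middlewares.ProxyMiddleware': 544,
}
PROXY_URL = 'http://127.0.0.1:5555/random'  # hypothetical proxy-pool endpoint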