python爬虫：编写多进程爬虫学习笔记

最新推荐文章于 2023-09-22 10:19:04 发布

山谷來客

最新推荐文章于 2023-09-22 10:19:04 发布

阅读量1.9k

点赞数 1

分类专栏： python

本文链接：https://blog.csdn.net/u010035907/article/details/52917202

版权

python 专栏收录该内容

28 篇文章 1 订阅

订阅专栏

# -*- coding: utf-8 -*-
"""
Created on Sat Oct 22 21:01:23 2016

@author: hhxsym
"""

import requests
import json
import os
import pymongo
import time
from bs4 import BeautifulSoup
from multiprocessing import Pool  #进程调用的包

# Working directory that holds the locally saved pages (e.g. city.html).
# The path contains Chinese characters; per the author's note, under
# Python 2 it has to be decoded to unicode first, otherwise the directory
# cannot be opened.
inpath = unicode("C:\\Users\\hhxsym\\Desktop\\课程群Python爬虫", "utf8")
os.chdir(inpath)

# MongoDB handles used by the crawler: a client for the server on
# localhost:27017, the 'sense' database and its 'url_list' collection.
# NOTE(review): these handles are created at import time, before the worker
# pool starts; PyMongo clients are documented as not fork-safe, so on
# fork-based platforms each worker should build its own client -- confirm.
client = pymongo.MongoClient('localhost', 27017)
sense = client.sense
url_list = sense.url_list


def get_city_urls():
    """Return the href of every destination link in the local ``city.html``.

    The file is read from the current working directory and parsed with
    BeautifulSoup using the lxml parser. Links are located with a CSS
    selector rooted at ``#destination_nav``.

    Returns:
        A list of href strings, in document order. Anchors that carry no
        ``href`` attribute are skipped, so callers never receive ``None``.
    """
    with open('city.html') as f:  # read the locally saved page
        html = f.read()
    soup = BeautifulSoup(html, 'lxml')
    # Child-combinator selector: the spaces around '>' are significant.
    anchors = soup.select('#destination_nav > div > div > div > dl.dl-list > dt > a')
    # Tag.get() yields None for a missing attribute; drop those so that
    # downstream code can safely call str methods on every element.
    return [a.get('href') for a in anchors if a.get('href')]

def _strip_jsonp(text):
    """Return the JSON payload wrapped in a JSONP response like ``jsonp({...})``."""
    body = text.strip()
    # Already plain JSON: nothing to unwrap.
    if body[:1] in ('{', '['):
        return body
    start = body.find('(')
    end = body.rfind(')')
    if start == -1 or end < start:
        raise ValueError('response is not JSONP: %r' % body[:50])
    return body[start + 1:end]


def get_page_list(city, page=1, timeout=10):
    """Fetch one listing page for a city and store every entry in MongoDB.

    Requests ``http://www.senseluxury.com/destinations_list/<id>`` where
    ``<id>`` is the last path segment of ``city``, asking for a JSONP
    response. Each element of ``['val']['data']`` in the decoded payload is
    inserted as one document into the module-level ``url_list`` collection
    and printed.

    Args:
        city: City URL or path; only the part after the last ``/`` is used.
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up, so
            that a stalled connection cannot block a worker forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response body is neither JSONP nor JSON.
    """
    now = time.strftime('%Y-%m-%d %H:%M:%S')  # local time of this crawl
    url = 'http://www.senseluxury.com/destinations_list/%s' % city.split('/')[-1]
    payload = {'page': page, 'callback': 'jsonp'}
    response = requests.get(url, params=payload, timeout=timeout)
    print(response.status_code)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    wb_data = json.loads(_strip_jsonp(response.text))
    # Show the text type next to the decoded type (string vs. dict).
    print('%s %s' % (type(response.text), type(wb_data)))

    for item in wb_data['val']['data']:
        data = {
            'title': item['title'],
            # The payload carries a site-relative link; store it absolute.
            'url': 'http://www.senseluxury.com' + item['url'],
            'id': item['id'],
            # '&nbsp;' entities become spaces, then split into a word list.
            'server': item['server'].replace('&nbsp;', ' ').split(),
            'memo': item['memo'],
            # Correctly spelled key; a misspelled 'prie' would hide the
            # value from any query on 'price'.
            'price': item['price'],
            'address': item['address'],
            'subject': item['subject'],
            'create_time': now,
        }

        # insert_one() adds an auto-generated '_id' to the dict in place.
        url_list.insert_one(data)
        print(data)


if __name__ == '__main__':
    # Crawl every city found in the local city.html with a pool of workers.
    city_urls = get_city_urls()
    print(city_urls)
    pool = Pool(processes=4)  # number of worker processes
    try:
        # pool.map(function, iterable) blocks until every URL is processed.
        pool.map(get_page_list, city_urls)
    finally:
        # Always shut the pool down, even when a worker raised.
        pool.close()  # no further tasks will be submitted
        pool.join()   # wait for the worker processes to exit


# 网页json类型的查看：浏览器 -> 右键 检查 -> network -> XHR ->页面触发(跳转页面) -> name勾选-> Response ->查看是否出现json格式字符串
# http://jsoneditoronline.org/  在线格式化网站，查看json嵌套格式

山谷來客

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
2
评论
python爬虫：编写多进程爬虫学习笔记

# -*- coding: utf-8 -*-"""Created on Sat Oct 22 21:01:23 2016@author: hhxsym"""import requestsimport jsonimport osimport pymongoimport timefrom bs4 import BeautifulSoupfrom multiprocessin
复制链接

扫一扫

专栏目录