A hands-on data scraping demo

Without further ado, straight to the code:

import requests
from pyquery import PyQuery as pq
import re
import logging
from urllib.parse import urljoin


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')


BASE_URL = 'https://static1.scrape.cuiqingcai.com'
PAGE_SIZE = 10  # total number of list pages to crawl



def scrape_page(url):
    """
    页面内容抓取
    :param url:
    :return:
    """
    logging.info(f'scraping {url} starting ...')
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error(f'scrape error, invalid status code: {response.status_code}, url: {url}')
    except requests.RequestException:
        logging.error(f'scrape error, url: {url}', exc_info=True)  # log the full traceback


def scrape_index(page):
    """
    抓取列表页
    :param page:
    :return:
    """
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def scrape_detail(url):
    """
    抓取详情
    :param url:
    :return:
    """
    return scrape_page(url)


def parse_index(html):
    """
    列表页面数据解析
    :param html:
    :return:
    """
    doc = pq(html)
    links = doc('.el-card .name')
    for link in links.items():
        href = link.attr('href')
        detail_url = urljoin(BASE_URL, href)
        logging.info(f'get detail url: {detail_url}')
        yield detail_url


def parse_detail(html):
    """
    解析详情
    :param html:
    :return:
    """
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a h2').text()
    categories = [category.text() for category in doc('.categories span').items()]
    published_at = doc('.info span:contains(上映)').text()  # the span labelled "上映" (release date)
    match = re.search(r'(\d{4}-\d{2}-\d{2})', published_at) if published_at else None
    published_at = match.group(1) if match else None
    drama = doc('.drama p').text()
    score = doc('p.score').text()
    score = float(score) if score else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def main():
    for page in range(1, PAGE_SIZE + 1):
        logging.info(f'current page: {page}')
        html = scrape_index(page)
        if not html:
            continue  # skip pages that failed to download
        detail_urls = parse_index(html)
        for detail_url in detail_urls:
            logging.info(f'scrape detail url: {detail_url}')
            detail_html = scrape_detail(detail_url)
            if not detail_html:
                continue  # skip detail pages that failed to download
            detail_data = parse_detail(detail_html)
            logging.info(f'scrape detail data: {detail_data}')


if __name__ == '__main__':
    main()

The basic crawl is now done; next, the scraped content will be stored in MongoDB.
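Before folding storage into the crawler, here is a minimal sketch of the upsert-style write that the next version uses, plus a quick read-back to confirm the documents landed. The connection URI, database/collection names and the sample record are assumptions for illustration; adjust them to your environment.

import pymongo

# Assumed local MongoDB instance without auth; adjust the URI as needed
client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
collection = client['movies']['movies3']

# Upsert keyed on the movie name: update the record if it exists, insert it otherwise
sample = {'name': 'Example Movie', 'score': 9.5, 'categories': ['剧情']}
collection.update_one({'name': sample['name']}, {'$set': sample}, upsert=True)

# Read back to verify the write
print(collection.count_documents({}))
print(collection.find_one({'name': 'Example Movie'}))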

 

To speed up the crawl, I'll optimize it with multiprocessing; the full script follows the short sketch below.
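The core of the change is simply handing each list page to a worker process via Pool.map. A minimal sketch of that pattern, with a dummy task standing in for the real per-page crawl:

import multiprocessing


def crawl_page(page):
    # Dummy stand-in for the real per-page crawl; each call runs in a separate worker process
    print(f'crawling page {page}')


if __name__ == '__main__':
    # Pool() defaults to one worker process per CPU core
    with multiprocessing.Pool() as pool:
        pool.map(crawl_page, range(1, 11))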

import requests
import pymongo
from pyquery import PyQuery as pq
import re
import logging
from urllib.parse import urljoin
import multiprocessing


logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')


BASE_URL = 'https://static1.scrape.cuiqingcai.com'
PAGE_SIZE = 10  # total number of list pages to crawl


MONGO_USER = 'root'
MONGO_PWD = '123456'
MONGO_CONNECT_STRING = f'mongodb://{MONGO_USER}:{MONGO_PWD}@127.0.0.1:27017/'

MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'movies3'

client = pymongo.MongoClient(MONGO_CONNECT_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]



def scrape_page(url):
    """
    页面内容抓取
    :param url:
    :return:
    """
    logging.info(f'scraping {url} starting ...')
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error(f'scrape error, invalid status code: {response.status_code}, url: {url}')
    except requests.RequestException:
        logging.error(f'scrape error, url: {url}', exc_info=True)  # log the full traceback


def scrape_index(page):
    """
    抓取列表页
    :param page:
    :return:
    """
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def scrape_detail(url):
    """
    抓取详情
    :param url:
    :return:
    """
    return scrape_page(url)


def parse_index(html):
    """
    列表页面数据解析
    :param html:
    :return:
    """
    doc = pq(html)
    links = doc('.el-card .name')
    for link in links.items():
        href = link.attr('href')
        detail_url = urljoin(BASE_URL, href)
        logging.info(f'get detail url: {detail_url}')
        yield detail_url


def parse_detail(html):
    """
    解析详情
    :param html:
    :return:
    """
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a h2').text()
    categories = [category.text() for category in doc('.categories span').items()]
    published_at = doc('.info span:contains(上映)').text()  # the span labelled "上映" (release date)
    match = re.search(r'(\d{4}-\d{2}-\d{2})', published_at) if published_at else None
    published_at = match.group(1) if match else None
    drama = doc('.drama p').text()
    score = doc('p.score').text()
    score = float(score) if score else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def save_data(data):
    """
    数据存储
    :param data: 详情
    :return:
    """
    collection.update_one({
        'name': data.get('name') # 查询条件
    },{
        '$set': data # 具体更新的数据
    }, upsert=True) # 存在则更新,否则插入


def main(page):
    logging.info(f'current page: {page}')
    html = scrape_index(page)
    if not html:
        return  # skip pages that failed to download
    detail_urls = parse_index(html)
    for detail_url in detail_urls:
        logging.info(f'scrape detail url: {detail_url}')
        detail_html = scrape_detail(detail_url)
        if not detail_html:
            continue  # skip detail pages that failed to download
        detail_data = parse_detail(detail_html)
        logging.info(f'scrape detail data: {detail_data}')
        logging.info('save data to mongo')
        save_data(detail_data)
        logging.info('save to mongo successfully')



if __name__ == '__main__':
    pool = multiprocessing.Pool()  # defaults to one worker process per CPU core
    pages = range(1, PAGE_SIZE+1)
    pool.map(main, pages)
    pool.close()
    pool.join()
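One note for running it: multiprocessing.Pool() starts one worker per CPU core, which may be more concurrency than the target site appreciates, and PyMongo advises against reusing a MongoClient created before the worker processes are forked, so you may see a fork-safety warning from the module-level client. A gentler entry point, as a minimal sketch under the assumption that four workers are enough:

# Sketch of a capped entry point; the worker count of 4 is an assumption, tune it for the target site
if __name__ == '__main__':
    pages = range(1, PAGE_SIZE + 1)
    with multiprocessing.Pool(processes=4) as pool:  # limit concurrency to four worker processes
        pool.map(main, pages)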
