Without further ado, here is the code:
import requests
from pyquery import PyQuery as pq
import re
import logging
from urllib.parse import urljoin

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'https://static1.scrape.cuiqingcai.com'
PAGE_SIZE = 10


def scrape_page(url):
    """
    Fetch the content of a page.
    :param url: page URL
    :return: HTML text, or None on failure
    """
    logging.info(f'scraping {url} starting ...')
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error(f'scrape error. invalid status code:{response.status_code} url:{url}')
    except requests.RequestException:
        logging.error(f'scrape error. url:{url}', exc_info=True)  # log the full traceback


def scrape_index(page):
    """
    Fetch a list page.
    :param page: page number
    :return: HTML text of the list page
    """
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def scrape_detail(url):
    """
    Fetch a detail page.
    :param url: detail page URL
    :return: HTML text of the detail page
    """
    return scrape_page(url)


def parse_index(html):
    """
    Parse a list page and yield the detail page URLs on it.
    :param html: list page HTML
    :return: generator of detail page URLs
    """
    doc = pq(html)
    links = doc('.el-card .name')
    for link in links.items():
        href = link.attr('href')
        detail_url = urljoin(BASE_URL, href)
        logging.info(f'get detail url: {detail_url}')
        yield detail_url


def parse_detail(html):
    """
    Parse a detail page into a movie record.
    :param html: detail page HTML
    :return: dict of movie fields
    """
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a h2').text()
    categories = [category.text() for category in doc('.categories span').items()]
    published_at = doc('.info span:contains(上映)').text()  # the span whose text contains '上映' (release date)
    match = re.search(r'(\d{4}-\d{2}-\d{2})', published_at) if published_at else None
    published_at = match.group(1) if match else None  # keep only the YYYY-MM-DD part
    drama = doc('.drama p').text()
    score = doc('p.score').text()
    score = float(score) if score else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def main():
    for page in range(1, PAGE_SIZE + 1):
        logging.info(f'current page: {page}')
        html = scrape_index(page)
        detail_urls = parse_index(html)
        for detail_url in detail_urls:
            logging.info(f'scrape detail url:{detail_url}')
            detail_html = scrape_detail(detail_url)
            detail_data = parse_detail(detail_html)
            logging.info(f'scrape detail data:{detail_data}')


if __name__ == '__main__':
    main()
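One caveat before moving on: requests.get above is called without a timeout, so a single stalled connection can hang the crawl indefinitely. Here is a minimal hardening sketch of scrape_page; the 10-second timeout is my assumption, not from the original, and raise_for_status simply turns non-2xx responses into exceptions:

import logging
import requests

def scrape_page(url):
    logging.info(f'scraping {url} starting ...')
    try:
        # assumed timeout: keeps one dead connection from blocking the whole crawl
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # non-2xx responses raise requests.HTTPError
        return response.text
    except requests.RequestException:  # HTTPError subclasses RequestException, so it is caught here too
        logging.error(f'scrape error. url:{url}', exc_info=True)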
The basic crawl is now done. Next, the scraped records are persisted to MongoDB, and the crawl is parallelized with multiprocessing to speed it up. In the full script below, save_data does an upsert keyed on the movie name, and main now takes a page number so that a process pool can fan the pages out across workers:
import requests
import pymongo
from pyquery import PyQuery as pq
import re
import logging
from urllib.parse import urljoin
import multiprocessing

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'https://static1.scrape.cuiqingcai.com'
PAGE_SIZE = 10
MONGO_USER = 'root'
MONGO_PWD = '123456'
MONGO_CONNECT_STRING = f'mongodb://{MONGO_USER}:{MONGO_PWD}@127.0.0.1:27017/'
MONGO_DB_NAME = 'movies'
MONGO_COLLECTION_NAME = 'movies3'

client = pymongo.MongoClient(MONGO_CONNECT_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]


def scrape_page(url):
    """
    Fetch the content of a page.
    :param url: page URL
    :return: HTML text, or None on failure
    """
    logging.info(f'scraping {url} starting ...')
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error(f'scrape error. invalid status code:{response.status_code} url:{url}')
    except requests.RequestException:
        logging.error(f'scrape error. url:{url}', exc_info=True)  # log the full traceback


def scrape_index(page):
    """
    Fetch a list page.
    :param page: page number
    :return: HTML text of the list page
    """
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)


def scrape_detail(url):
    """
    Fetch a detail page.
    :param url: detail page URL
    :return: HTML text of the detail page
    """
    return scrape_page(url)


def parse_index(html):
    """
    Parse a list page and yield the detail page URLs on it.
    :param html: list page HTML
    :return: generator of detail page URLs
    """
    doc = pq(html)
    links = doc('.el-card .name')
    for link in links.items():
        href = link.attr('href')
        detail_url = urljoin(BASE_URL, href)
        logging.info(f'get detail url: {detail_url}')
        yield detail_url


def parse_detail(html):
    """
    Parse a detail page into a movie record.
    :param html: detail page HTML
    :return: dict of movie fields
    """
    doc = pq(html)
    cover = doc('img.cover').attr('src')
    name = doc('a h2').text()
    categories = [category.text() for category in doc('.categories span').items()]
    published_at = doc('.info span:contains(上映)').text()  # the span whose text contains '上映' (release date)
    match = re.search(r'(\d{4}-\d{2}-\d{2})', published_at) if published_at else None
    published_at = match.group(1) if match else None  # keep only the YYYY-MM-DD part
    drama = doc('.drama p').text()
    score = doc('p.score').text()
    score = float(score) if score else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }


def save_data(data):
    """
    Persist one movie record to MongoDB.
    :param data: detail dict returned by parse_detail
    :return:
    """
    collection.update_one({
        'name': data.get('name')  # query filter
    }, {
        '$set': data  # fields to update
    }, upsert=True)  # update if the record exists, insert otherwise


def main(page):
    logging.info(f'current page: {page}')
    html = scrape_index(page)
    detail_urls = parse_index(html)
    for detail_url in detail_urls:
        logging.info(f'scrape detail url:{detail_url}')
        detail_html = scrape_detail(detail_url)
        detail_data = parse_detail(detail_html)
        logging.info(f'scrape detail data:{detail_data}')
        logging.info('save data to mongo')
        save_data(detail_data)
        logging.info('save to mongo successfully')


if __name__ == '__main__':
    pool = multiprocessing.Pool()
    pages = range(1, PAGE_SIZE + 1)
    pool.map(main, pages)
    pool.close()
    pool.join()
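A final caveat on the multiprocessing version: pymongo's MongoClient is not fork-safe, and the script above creates it at import time, before multiprocessing.Pool forks its workers, so pymongo warns about clients opened before a fork. A safer pattern is to give each worker its own client through a pool initializer. Below is a minimal sketch of that idea, reusing the constants and main from the script above; the init_worker name and the rebinding of collection are illustrative, not part of the original:

import multiprocessing
import pymongo

collection = None  # rebound inside each worker by init_worker

def init_worker():
    """Pool initializer: runs once in every child process."""
    global collection
    client = pymongo.MongoClient(MONGO_CONNECT_STRING)
    collection = client[MONGO_DB_NAME][MONGO_COLLECTION_NAME]

if __name__ == '__main__':
    pool = multiprocessing.Pool(initializer=init_worker)
    pool.map(main, range(1, PAGE_SIZE + 1))
    pool.close()
    pool.join()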