Python3 Crawler Examples: Three Hands-On Case Studies from Hellobi Academy (天善学院)

Course link: https://edu.hellobi.com/course/156/lessons

Case 1: Analyzing Ajax requests to scrape Toutiao (今日头条) street-photography images

Source file 1: collect image URLs and article URLs from Toutiao's image channel, download the images, and store the records in MongoDB.

import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

from config import *

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    # Reproduce the Ajax query the search page issues while scrolling.
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    # Name the file after the MD5 of its bytes, so duplicate images dedupe themselves.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)  # the with-block closes the file; no explicit close() needed


def parse_page_index(text):
    # Yield article URLs from the JSON the Ajax endpoint returns.
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # The image list is embedded in an inline script as gallery: JSON.parse("...").
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    # insert() was removed in pymongo 4; insert_one() is the modern equivalent.
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully saved to MongoDB', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
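The "analyze Ajax" step amounts to reproducing the query string the search page sends as you scroll. As a sanity check, this is what get_page_index's URL construction yields for the first page (offset 0); the output line is what urlencode produces on Python 3.7+, where dicts keep insertion order:

from urllib.parse import urlencode

data = {'autoload': 'true', 'count': 20, 'cur_tab': 3,
        'format': 'json', 'keyword': '街拍', 'offset': 0}
print('http://www.toutiao.com/search_content/?' + urlencode(data))
# http://www.toutiao.com/search_content/?autoload=true&count=20&cur_tab=3&format=json&keyword=%E8%A1%97%E6%8B%8D&offset=0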

Source file 2: config.py

MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

GROUP_START = 1
GROUP_END = 20

KEYWORD = '街拍'
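To reproduce this case, start a local MongoDB instance and put config.py next to the main script (the notes don't give the script's filename), then run it with Python 3; the process pool crawls offsets 20 through 400 (20 pages of 20 results each) in parallel.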

Key takeaway: the generator-based way article links are pulled out of the JSON response is worth studying (a runnable illustration follows the snippet).

def parse_page_index(html):
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')
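A quick illustration of the generator in action, feeding it a hand-rolled payload (the field names mirror the real response; the URLs are placeholders):

import json

sample = json.dumps({'data': [
    {'article_url': 'http://toutiao.com/a/1'},
    {'article_url': 'http://toutiao.com/a/2'},
]})

for url in parse_page_index(sample):  # the function defined above
    print(url)
# http://toutiao.com/a/1
# http://toutiao.com/a/2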

Case 2: Scraping Taobao food listings with Selenium + PhantomJS

Source file 1: core script that searches Taobao for a keyword, pages through the results, and extracts product details. (The original notes labeled this section "Requests + regex for the Maoyan TOP100", but the code below is clearly the Selenium Taobao case.)

import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# PhantomJS, as used in the course; see the note after config.py for a modern alternative.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


def search():
    print('Searching')
    try:
        browser.get('https://www.taobao.com')
        # Explicit waits: block until the search box and button are usable.
        input_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input_box.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        # On timeout, simply retry the whole search.
        return search()


def next_page(page_number):
    print('Turning to page', page_number)
    try:
        input_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input_box.clear()
        input_box.send_keys(page_number)
        submit.click()
        # Wait until the pager highlights the requested page number.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        get_products()
    except TimeoutException:
        next_page(page_number)


def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    # Parse the rendered page source with PyQuery rather than poking at WebElements.
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],  # strip the trailing '人付款'
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    try:
        # insert() was removed in pymongo 4; insert_one() is the modern equivalent.
        if db[MONGO_TABLE].insert_one(result):
            print('Saved to MongoDB', result)
    except Exception:
        print('Failed to save to MongoDB', result)


def main():
    try:
        total = search()
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('Something went wrong')
    finally:
        browser.close()


if __name__ == '__main__':
    main()
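main() pulls the page count out of the pager's text with a bare regex. On Taobao the total element reads something like '共 100 页,' (an assumption here, reconstructed from how the result is used), so the extraction works out to:

import re

total_text = '共 100 页,'  # illustrative pager text, not captured from a live page
total = int(re.compile(r'(\d+)').search(total_text).group(1))
print(total)  # 100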

Source file 2: config.py

MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'

SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

KEYWORD = '美食'
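One caveat when reproducing this today: PhantomJS is unmaintained and Selenium has dropped support for it, so webdriver.PhantomJS no longer exists in current releases. A minimal substitution sketch, assuming Chrome and a matching chromedriver are installed (SERVICE_ARGS then becomes unnecessary):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run without a visible window
# roughly equivalent to PhantomJS's --load-images=false
options.add_argument('--blink-settings=imagesEnabled=false')
browser = webdriver.Chrome(options=options)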

Key takeaways: the explicit-wait pattern (WebDriverWait plus expected_conditions) around every interaction, and parsing browser.page_source with PyQuery instead of scraping elements one by one.
