实践1 – 搭建Python开发环境
在Linux中安装配置Python3(多版本)
- 在CentOS下编译安装python3前,建议先将yum源更改为国内的yum源
- 安装编译所需的工具
yum install -y gcc zlib zlib-devel libffi-devel openssl openssl-devel
- 下载并编译安装Python3
wget https://www.python.org/ftp/python/3.6.2/Python-3.6.2.tgz
tar -xvf Python-3.6.2.tgz
cd Python-3.6.2
./configure --with-ssl
make && make install
- 检查
python3
pip3
使用Virtualenv创建独立的Python运行环境
- pip安装建议使用国内的pypi源,否则经常会因为网络超时导致模块安装失败
1.安装virtualenv
pip3 install -i https://pypi.doubanio.com/simple/ virtualenv
- 创建python虚拟环境
mkdir myspace #工作目录
cd myspace
virtualenv -p python3 venv
- 激活python虚拟环境
. venv/bin/activate
- 退出python虚拟环境
deactivate
实践2 – urllib和Requests的使用
urllib使用
- 发起GET请求
# http_get.py
# Issue a simple GET request with urllib and print the raw response body.
from urllib import request

# Use a context manager so the underlying connection is always closed,
# even if read() raises (the original left the response unclosed).
with request.urlopen("http://www.baidu.com/") as response:
    print(response.read())
- 发起带参数的请求
# http_params.py
# Perform a GET request with URL-encoded query parameters and save
# the response body to a local HTML file.
from urllib import request, parse

# Build the query string first, then append it to the base URL.
query = parse.urlencode({'word': 'Python爬虫',
                         'tn': 'baiduhome_pg',
                         'ie': 'utf-8'})
url = 'http://www.baidu.com/s?' + query

with request.urlopen(url) as response:
    with open("response.html", "wb") as file:
        file.write(response.read())
- 发起POST请求
# http_post.py
# Send a POST request: a urlencoded body (as bytes) plus a query
# string carried in the URL itself.
from urllib import request, parse

payload = parse.urlencode({'terms': 'here is test'}).encode()
post_req = request.Request('http://httpbin.org/post?q=Python', data=payload)
with request.urlopen(post_req) as response:
    print(response.read())
requests使用
- 使用前需安装Requests模块
pip install -i https://pypi.doubanio.com/simple/ requests
- 文档 https://2.python-requests.org//zh_CN/latest/index.html
- 发起http请求
# req_http.py
# Basic GET and POST examples using the requests library.
import requests

# GET request — query parameters are appended to the URL automatically.
resp_get = requests.get('https://httpbin.org/get',
                        params={'terms': 'here is test'})
print(resp_get.url)
print(resp_get.status_code)
print(resp_get.content)

# POST request with a form-encoded body.
resp_post = requests.post('https://httpbin.org/post',
                          data={'terms': 'here is test'})
print(resp_post.content)
- 常见设置
# req_header.py
# Common request customizations: headers, cookies, timeout, proxies,
# and a Session that persists cookies across requests.
import requests

# Custom request headers.
custom_headers = {'user-agent': 'Mozilla/5.0'}
hdr_resp = requests.get('http://httpbin.org/headers', headers=custom_headers)
print(hdr_resp.text)

# Send explicit cookies with a single request.
jar = {'from-my': 'browser'}
cookie_resp = requests.get('http://httpbin.org/cookies', cookies=jar)
print(cookie_resp.text)

# Abort if no response arrives within the timeout (seconds).
timed_resp = requests.get('https://www.baidu.com', timeout=5)

# Route the request through proxies (one per scheme).
# Free proxy lists: 西刺 https://www.xicidaili.com/
proxy = {
    'http': 'http://112.85.170.175:9999',
    'https': 'https://118.190.73.168:808',
}
proxied_resp = requests.get('http://www.kuaidaili.com/free/',
                            proxies=proxy, timeout=2)
print(proxied_resp.content)

# A Session keeps cookies between consecutive requests.
session = requests.Session()
session.cookies = requests.utils.cookiejar_from_dict({"a": "c"})
sess_resp = session.get('http://httpbin.org/cookies')
print(sess_resp.text)
sess_resp = session.get('http://httpbin.org/cookies')
print(sess_resp.text)
实践3 – Beautiful Soup的使用
- 使用前需安装beautiful soup4模块
pip install -i https://pypi.doubanio.com/simple/ beautifulsoup4
- 文档: https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
使用bs4解析网页
- 基于标签Tag查找
# douban_top250.py
# Fetch the Douban Top250 page and extract movie titles by walking tags.
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
page = requests.get('https://movie.douban.com/top250', headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')

# Tag-based lookup: each title sits at div.hd > a > span.
for hd_div in soup.find_all('div', class_='hd'):
    print(hd_div.a.span.string)

# Print the "next page" link when it exists.
pager = soup.find('span', class_='next')
if pager is not None:
    print(pager.a['href'])
- 基于CSS selector查找
# douban_top250.py
# Fetch the Douban Top250 page and extract movie titles via CSS selectors.
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
page = requests.get('https://movie.douban.com/top250', headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')

# CSS selector: the first .title span inside each item's link.
for title_node in soup.select('.item a .title:first-child'):
    print(title_node.get_text())

# select_one returns the first match or None.
next_anchor = soup.select_one('.next a')
if next_anchor is not None:
    print(next_anchor['href'])
实践4 – MongoDB基础
MongoDB数据库基本操作
- 启动/连接数据库
cd /path/for/mongodb/bin
./mongod --dbpath /path/for/data/
./mongo
- 创建数据库
use spider
db
show dbs
db.dropDatabase()
- 创建集合
db.createCollection('douban')
show collections
db.douban.drop()
- 创建文档
db.douban.insert({'title': '豆瓣'}) # 如果集合douban不存在也会自动创建
db.douban.find()
db.douban.update({'title': '豆瓣'}, {$set:{'title': '豆瓣爬虫'}})
db.douban.remove({})
pymongo操作数据库
- 使用前需安装pymongo模块
pip install -i https://pypi.doubanio.com/simple/ pymongo
- pymongo操作数据库
# Connect to a local MongoDB instance, insert one document into the
# jobs.jobs_bigdata collection, then list everything in it.
import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017)
collection = client["jobs"]["jobs_bigdata"]

document = {
    "title": "肖申克的救赎",
    "star": 1000,
    "url": "https://movie.douban.com/subject/1292052/",
}
result = collection.insert_one(document)
print(result.inserted_id)

for record in collection.find():
    print(record)
实践5 – 爬取动态网页
爬取豆瓣电影Top250
- 使用前需安装selenium模块
pip install -i https://pypi.doubanio.com/simple/ selenium
- 文档: https://selenium-python-zh.readthedocs.io/en/latest/
- 下载Chrome WebDriver:http://chromedriver.storage.googleapis.com/index.html
- 模拟登录
# login.py
# Drive Chrome with Selenium: open Baidu, type a query and submit it.
from selenium import webdriver

browser = webdriver.Chrome(executable_path='f:/bigdata/chromedriver.exe')
browser.get('http://www.baidu.com')

# Fill the search box (id="kw") and click the submit button (id="su").
search_box = browser.find_element_by_id("kw")
search_box.clear()
search_box.send_keys('python爬虫')
browser.find_element_by_id("su").click()
- 加载js
# pulldown.py
# Log in to OSChina with Selenium, then scroll the page three times so
# content loaded lazily by JavaScript gets fetched.
from selenium import webdriver
import time

browser = webdriver.Chrome(executable_path='f:/bigdata/chromedriver.exe')
browser.get('https://www.oschina.net/home/login?goto_page=https%3A%2F%2Fwww.oschina.net%2Fblog')
# Give the page time to finish loading; otherwise the elements below
# cannot be located yet.
time.sleep(5)

browser.find_element_by_css_selector("#userMail").send_keys("******")
browser.find_element_by_css_selector("#userPassword").send_keys("******")
browser.find_element_by_css_selector(".btn.btn-green.block.btn-login").click()

# Scroll to the bottom repeatedly to trigger infinite-scroll loading.
scroll_js = "window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage;"
for _ in range(3):
    browser.execute_script(scroll_js)
    time.sleep(3)