Data Acquisition
1. A simple request:
import requests
url = 'https://www.baidu.com'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
data = requests.get(url, headers=headers)
print(data.text)
2. Parsing the data:
from bs4 import BeautifulSoup
soup = BeautifulSoup(data.text, 'lxml')
print(soup)
The main lookup methods on soup are soup.find() and soup.find_all().
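A quick sketch of the two, reusing the soup object from above to list the page's <a> tags:
# find() returns the first matching tag (or None); find_all() returns a list
first_link = soup.find('a')
all_links = soup.find_all('a')
for link in all_links:
    print(link.get('href'))  # read a tag attribute with get()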
3. Storing the data:
import pandas as pd
result = pd.DataFrame()  # create an empty DataFrame to hold the data
result['img_url'] = img_urls  # fill the column with the scraped values
result.to_csv('result.csv', index=None)  # write the result to a CSV file
4. Exception handling:
try:
    ...  # code that may raise an exception
except SomeException:
    ...  # handle that specific exception type
finally:
    ...  # always runs, whether or not an exception occurred
A more general form of exception handling, retrying on server errors:
import time
import requests

def get_data(url, num_retries=3):
    data = requests.get(url)
    # 5xx status codes are server errors, so the request is worth retrying
    if (data is not None) and (500 <= data.status_code < 600):
        if num_retries > 0:
            print("Server error, retrying...")
            time.sleep(1)
            num_retries -= 1
            return get_data(url, num_retries)
    return data
Note: when the server responds with an error, the request can simply be retried.
5. Dynamic User-Agent:
from fake_useragent import UserAgent
ua = UserAgent()
ua.chrome  # a Chrome User-Agent string
ua.ie      # an IE User-Agent string
ua.random  # a random User-Agent string
6. Handling encodings:
import chardet
import requests
data = requests.get(url)
charset = chardet.detect(data.content)  # detect the encoding from the raw bytes
data.encoding = charset['encoding']  # tell requests which encoding to use
print(data.text)
7. Putting the previous pieces together:
import re
import time
import chardet
import requests
import urllib.robotparser
from fake_useragent import UserAgent
# Build request headers with a random User-Agent
def get_headers():
    ua = UserAgent()
    user_agent = ua.random
    headers = {'User-Agent': user_agent}
    return headers
# For simplicity this function returns hard-coded proxies;
# the same function could scrape free proxy IPs instead,
# but that is not the focus here, so we skip it
def get_proxies():
    proxies = {
        "http": "125.88.74.122:84",
        # "http": "123.84.13.240:8118",  # duplicate "http" keys in a dict silently overwrite each other
        "https": "94.240.33.242:3128"
    }
    return proxies
# robots.txt check
def robot_check(robotstxt_url, headers, url):
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robotstxt_url)
    rp.read()
    result = rp.can_fetch(headers['User-Agent'], url)
    return result
# Fetch the page data. We return the response object rather than
# data.text, because when grabbing images the caller needs data.content
def get_data(url, num_retries=3, proxies=None):
    try:
        data = requests.get(url, timeout=5, headers=headers, proxies=proxies)
        print(data.status_code)
    except requests.exceptions.ConnectionError as e:
        print("Request error, url:", url)
        print("Error details:", e)
        data = None
    except Exception:  # any other error
        print("Unknown error, url:", url)
        data = None
    if (data is not None) and (500 <= data.status_code < 600):
        if num_retries > 0:
            print("Server error, retrying...")
            time.sleep(1)
            num_retries -= 1
            return get_data(url, num_retries, proxies=proxies)
    return data
# Parse the page content: extraction, storage, and so on
def parse_data(data):
    if data is None:
        return None
    charset = chardet.detect(data.content)
    data.encoding = charset['encoding']
    html_text = data.text
    '''
    parsing and extraction happen here; as an example, grab the page title
    '''
    interesting_data = re.findall('<title>(.*?)</title>', html_text)
    return interesting_data
if __name__ == '__main__':
    headers = get_headers()
    proxies = get_proxies()
    data = get_data("http://www.baidu.com", num_retries=3, proxies=proxies)
    interesting_data = parse_data(data)
    print(interesting_data)
8. Simulating login:
First log in through the form to obtain cookies and save them to a file; later, load the cookies from that file into a session object and use the session for all subsequent requests.
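A minimal sketch of that flow, assuming a hypothetical form-login endpoint and field names:
import json
import requests

login_url = 'https://example.com/login'  # hypothetical login endpoint

# 1. log in through the form and save the cookies to a file
session = requests.Session()
session.post(login_url, data={'username': 'you', 'password': 'secret'})
with open('cookies.json', 'w') as f:
    json.dump(session.cookies.get_dict(), f)

# 2. later: load the saved cookies into a fresh session and keep scraping
session = requests.Session()
with open('cookies.json') as f:
    session.cookies.update(json.load(f))
data = session.get('https://example.com/protected')  # subsequent requests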
9. Dealing with captchas:
- Manual input
- OCR with pytesseract (see the sketch below)
- A captcha-solving service such as Yundama
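For the pytesseract route, a minimal sketch, assuming the Tesseract binary is installed and the captcha is a simple low-noise image ('captcha.png' is a hypothetical file):
from PIL import Image
import pytesseract

def solve_captcha(image_path):
    img = Image.open(image_path).convert('L')  # grayscale often helps OCR
    return pytesseract.image_to_string(img).strip()

print(solve_captcha('captcha.png'))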
For the Yundama route, wrap the call in its own file so it can be imported later:
import json
import requests

def getcode_from_yundama():
    captcha_username = 'your username'
    captcha_password = 'your password'
    captcha_id = 1
    captcha_appkey = 'your KEY'
    captcha_codetype = '3000'
    captcha_url = 'http://api.yundama.com/api.php?method=upload'
    captcha_result_url = 'http://api.yundama.com/api.php?cid={}&method=result'
    filename = 'douban.jpg'
    timeout = 30
    postdata = {'method': 'upload', 'username': captcha_username,
                'password': captcha_password, 'appid': captcha_id,
                'appkey': captcha_appkey, 'codetype': captcha_codetype,
                'timeout': timeout}
    fo = open(filename, 'rb')
    file = {'file': fo.read()}
    response = requests.post(captcha_url, postdata, files=file).text
    print(response)
    fo.close()
    response = json.loads(response)
    code = response['text']
    status = response['ret']
    if status == 0:
        print("Recognition succeeded!")
        print('Captcha text:', code)
        return code
10. Dynamically loaded content:
- Direct extraction with a regular expression (re.findall()):
re_data = re.findall('pcMiaoShaAreaList(({.*}))', data.text)[0]
json_data = json.loads(re_data)
- Via Selenium.
Headless-browser approach:
import time
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities  # for customizing request headers

def run():
    login_url = 'https://accounts.douban.com/login'  # the page to open
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
    driver = webdriver.PhantomJS('/home/shensir/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
                                 desired_capabilities=dcap)
    driver.get(login_url)  # open the page
    time.sleep(5)  # wait 5s so the page loads completely
    # screenshot of the initial login page
    driver.get_screenshot_as_file('before-login.png')
    # html = driver.page_source  # source of the current page
    # print(html)
    # fill in the account and password to log in
    driver.find_element_by_xpath('//*[@id="email"]').send_keys('your account')
    driver.find_element_by_xpath('//*[@id="password"]').send_keys('your password')
    time.sleep(3)
    # screenshot after the form has been filled in
    driver.get_screenshot_as_file('after-insert.png')
    # click the login button
    driver.find_element_by_xpath('//*[@id="lzform"]/div[6]/input').click()
    # inspect the page after logging in
    time.sleep(3)
    driver.get_screenshot_as_file('after-login.png')
    '''
    post-login operations, e.g.
    html = driver.get('http://...')
    getdata(html)
    '''
    # if the program aborts, the driver is not released automatically,
    # so in practice wrap this in exception handling to guarantee release
    driver.quit()

if __name__ == '__main__':
    run()
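Note that PhantomJS support has been removed from recent Selenium releases; the same flow works with headless Chrome. A minimal sketch, assuming Selenium 4+ and a matching chromedriver on the PATH:
from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run without a visible window
driver = webdriver.Chrome(options=options)
try:
    driver.get('https://accounts.douban.com/login')
    driver.get_screenshot_as_file('before-login.png')
    # Selenium 4 style element lookup
    driver.find_element(By.XPATH, '//*[@id="email"]').send_keys('your account')
finally:
    driver.quit()  # always release the driver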
11. Multithreading and multiprocessing:
The code below illustrates each approach:
import time
import requests
import concurrent.futures
import pandas as pd
import threading
from multiprocessing import Pool
# Decorator: print the wrapped function's execution time
def gettime(func):
    def wrapper(*args, **kwargs):
        print("=" * 50)
        print(func.__name__, 'Start...')
        starttime = time.time()
        result = func(*args, **kwargs)
        endtime = time.time()
        spendtime = endtime - starttime
        print(func.__name__, "End...")
        print("Spend", spendtime, "s totally")
        print("=" * 50)
        return result
    return wrapper
# Pull n URLs from a file for testing
def get_urls_from_file(n):
    df = pd.read_csv('TestUrls.csv')  # 1000 URLs in total
    urls = list(df['url'][:n])
    return urls
# Request and parse a page (here the "data" is simply the page source)
def getdata(url, retries=3):
    # print("Downloading:", url)
    headers = {}
    try:
        html = requests.get(url, headers=headers)
        # print(html)
    except requests.exceptions.ConnectionError as e:
        # print('Download error [ConnectionError]:', e)
        html = None
    # 5xx codes are server errors, so the request can be retried
    if (html is not None) and (500 <= html.status_code < 600) and retries:
        retries -= 1
        # print('Server error, retrying...')
        return getdata(url, retries)
    if html is not None:
        data = html.text
    else:
        data = None
    return data
# Serial
@gettime
def Mynormal():
    for url in urls:
        getdata(url)
# Process pool
@gettime
def MyprocessPool(num=10):
    pool = Pool(num)
    results = pool.map(getdata, urls)
    pool.close()
    pool.join()
    return results
# Multithreading
@gettime
def Mymultithread(max_threads=10):
    # worker: keep consuming URLs until none are left
    def urls_process():
        while True:
            try:
                # pop a url off the end of urls
                url = urls.pop()
            except IndexError:
                # urls is empty, everything has been crawled, stop
                break
            data = getdata(url, retries=3)
            '''
            extract and store the page data here
            '''
    threads = []
    # spawn new threads while below the limit and URLs remain to be crawled
    while len(threads) < max_threads and urls:
        thread = threading.Thread(target=urls_process)
        # print('Spawning thread', thread.name)
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
# Thread pool
@gettime
def Myfutures(num_of_max_works=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_of_max_works) as executor:
        executor.map(getdata, urls)
if __name__ == '__main__':
    # test with 100 pages
    urls = get_urls_from_file(100)
    Mynormal()         # serial
    MyprocessPool(10)  # process pool
    Myfutures(10)      # thread pool
    Mymultithread(10)  # multithreading
'''
Timing results for 100 pages:
==================================================
Mynormal Start...
Mynormal End...
Spend 20.605727672576904 s totally
==================================================
==================================================
MyprocessPool Start...
MyprocessPool End...
Spend 2.4525890350341797 s totally
==================================================
==================================================
Mymultithread Start...
Mymultithread End...
Spend 2.1947641372680664 s totally
==================================================
==================================================
Myfutures Start...
Myfutures End...
Spend 2.1515889167785645 s totally
==================================================
'''