1. Python Basics
# __name__ is a special built-in variable
print(__name__)
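When a file is run directly, __name__ is "__main__"; when the same file is imported as a module, it holds the module's name instead. A minimal sketch:

# Run this file directly and it prints "running as a script".
# Import it from another file and the branch is skipped, because
# __name__ then holds the module's name, not "__main__".
if __name__ == "__main__":
    print("running as a script")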
# List comprehension
x = [i for i in range(10) if i % 2 == 0]
print(x)
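For comparison, the comprehension above is shorthand for this explicit loop:

x = []
for i in range(10):
    if i % 2 == 0:
        x.append(i)
print(x)  # [0, 2, 4, 6, 8]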
# Decorator --> wraps a function (or class) to extend its behavior
import time

def printtime(func):
    def f(*args, **kwargs):
        print(time.ctime())
        return func(*args, **kwargs)
    return f

@printtime
def printHello():
    print("hello world")

printHello()
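One caveat the example above does not show: after decoration, printHello.__name__ reports "f", because the wrapper replaces the original function's metadata. The usual fix is functools.wraps; a minimal sketch:

import functools
import time

def printtime(func):
    @functools.wraps(func)  # preserves func.__name__ and func.__doc__
    def f(*args, **kwargs):
        print(time.ctime())
        return func(*args, **kwargs)
    return f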
# Recursive function: factorial of n
def jcN(n):
    if n == 1:
        return n
    return n * jcN(n - 1)

print(jcN(66))
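Note that CPython caps recursion depth (1000 frames by default), so jcN(66) is fine but a very large n would raise RecursionError. The limit can be inspected, and raised if genuinely needed:

import sys
print(sys.getrecursionlimit())  # 1000 by default
# sys.setrecursionlimit(5000)   # raise the cap only if genuinely needed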
2. Data Acquisition
Request a web page and parse it, using Douban's new-book list (新书速递) as the example.
import requests
from bs4 import BeautifulSoup as bs

# Request the page
url = "https://book.douban.com/latest"
# HTTP header field names are case-insensitive
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"}
data = requests.get(url, headers=headers)
# Parse the HTML
soup = bs(data.text, "lxml")
# Selectors below come from inspecting the page structure
books_left = soup.find('ul', {"class": "cover-col-4 clearfix"})
books_left = books_left.find_all("li")
books_right = soup.find('ul', {"class": "cover-col-4 pl20 clearfix"})
books_right = books_right.find_all("li")
books = list(books_left) + list(books_right)
# Extract the same fields from every book entry
img_urls = []
titles = []
ratings = []
authors = []
details = []
for book in books:
    # Cover image URL
    img_url = book.find_all('a')[0].find("img").get("src")
    img_urls.append(img_url)
    # Title
    title = book.find_all("a")[1].get_text()
    titles.append(title)
    # Star rating
    rating = book.find("p", {"class": "rating"}).get_text()
    rating = rating.replace("\n", "").replace(" ", "")
    ratings.append(rating)
    # Author
    author = book.find("p", {"class": "color-gray"}).get_text()
    author = author.replace("\n", "").replace(" ", "")
    authors.append(author)
    # Synopsis
    detail = book.find_all("p")[2].get_text()
    detail = detail.replace("\n", "").replace(" ", "")
    details.append(detail)
# Summary
print("img_urls : ", img_urls)
print("titles : ", titles)
print("ratings : ", ratings)
print("authors : ", authors)
print("details : ", details)
Data Storage
Save the data in CSV format.
import pandas as pd

# Store the data
result = pd.DataFrame()
result["img_urls"] = img_urls
result["titles"] = titles
result["ratings"] = ratings
result["authors"] = authors
result["details"] = details
result.to_csv("result.csv", index=False)
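To sanity-check the file, read it back with pandas:

check = pd.read_csv("result.csv")
print(check.head())  # the first rows should match the scraped data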
Improving code readability
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

# Request the page
def get_data():
    url = "https://book.douban.com/latest"
    # HTTP header field names are case-insensitive
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"}
    data = requests.get(url, headers=headers)
    return data

# Parse the HTML
def parse_data(data):
    soup = bs(data.text, "lxml")
    # Selectors below come from inspecting the page structure
    books_left = soup.find('ul', {"class": "cover-col-4 clearfix"})
    books_left = books_left.find_all("li")
    books_right = soup.find('ul', {"class": "cover-col-4 pl20 clearfix"})
    books_right = books_right.find_all("li")
    books = list(books_left) + list(books_right)
    # Extract the same fields from every book entry
    img_urls = []
    titles = []
    ratings = []
    authors = []
    details = []
    for book in books:
        # Cover image URL
        img_url = book.find_all('a')[0].find("img").get("src")
        img_urls.append(img_url)
        # Title
        title = book.find_all("a")[1].get_text()
        titles.append(title)
        # Star rating
        rating = book.find("p", {"class": "rating"}).get_text()
        rating = rating.replace("\n", "").replace(" ", "")
        ratings.append(rating)
        # Author
        author = book.find("p", {"class": "color-gray"}).get_text()
        author = author.replace("\n", "").replace(" ", "")
        authors.append(author)
        # Synopsis
        detail = book.find_all("p")[2].get_text()
        detail = detail.replace("\n", "").replace(" ", "")
        details.append(detail)
    return img_urls, titles, ratings, authors, details

# Store the data
def save_data(img_urls, titles, ratings, authors, details):
    result = pd.DataFrame()
    result["img_urls"] = img_urls
    result["titles"] = titles
    result["ratings"] = ratings
    result["authors"] = authors
    result["details"] = details
    result.to_csv("result.csv", index=False)

# Run the crawl
def run():
    data = get_data()
    img_urls, titles, ratings, authors, details = parse_data(data)
    save_data(img_urls, titles, ratings, authors, details)

if __name__ == "__main__":
    run()
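As an optional hardening step (not in the original), the request can be given a timeout and a status check, so a stalled or failed fetch fails loudly instead of hanging; a sketch of get_data with those added:

def get_data():
    url = "https://book.douban.com/latest"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:68.0) Gecko/20100101 Firefox/68.0"}
    data = requests.get(url, headers=headers, timeout=10)  # give up on a stalled connection
    data.raise_for_status()  # raise an HTTPError on a 4xx/5xx response
    return data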
You can also set a proxy IP in get().
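A minimal sketch of the proxies parameter; the address 127.0.0.1:8888 is a placeholder, not a working proxy:

proxies = {
    "http": "http://127.0.0.1:8888",   # placeholder proxy address
    "https": "http://127.0.0.1:8888",
}
data = requests.get(url, headers=headers, proxies=proxies)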
If the text returned by get() is visibly garbled, use chardet to detect the actual encoding.
import requests
import chardet

data = requests.get("http://baidu.com")
charset = chardet.detect(data.content)
print(charset)
# Output: {'encoding': 'ascii', 'confidence': 1.0, 'language': ''}
data.encoding = charset["encoding"]
print(data.text)
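requests also exposes this detection directly: response.apparent_encoding runs charset detection over the raw body, so the snippet above can be shortened to:

data = requests.get("http://baidu.com")
data.encoding = data.apparent_encoding  # charset detected from the response bytes
print(data.text)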
Dynamic User-Agent
import fake_useragent

ua = fake_useragent.UserAgent()
for i in range(10):
    print(ua.random)
# Sample output:
# Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6
# Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0
# Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36
# Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36
# Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36
# Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36
# Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36
# Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17
# Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36
# Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36
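To put this to use, draw a fresh random UA into the request headers so each run presents a different browser signature:

import fake_useragent
import requests

ua = fake_useragent.UserAgent()
headers = {"User-Agent": ua.random}  # a different browser signature each time
data = requests.get("https://book.douban.com/latest", headers=headers)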
Simulated Login
import requests
import pickle
import fake_useragent
from bs4 import BeautifulSoup

url = "https://accounts.douban.com/login"

# Log in by submitting the form and capture the cookies
def get_cookie_from_net(url):
    # Build the form payload (**id** and **password** are placeholders)
    payload = "ck=&name=**id**&password=**password**&remember=false&ticket="
    data = s.post(url, headers=headers, data=payload, verify=True)  # verify=True keeps SSL certificate checking on (verify=False would skip it)
    with open("cookies.douban", "wb") as f:
        cookiedict = requests.utils.dict_from_cookiejar(s.cookies)
        pickle.dump(cookiedict, f)
    print("Logged in via form submission; cookies captured...")
    return s.cookies

# Load cookies from the cookie file
def get_cookie_from_file():
    with open("cookies.douban", "rb") as f:
        cookiedict = pickle.load(f)
    cookies = requests.utils.cookiejar_from_dict(cookiedict)
    print("Cookie file parsed; cookies loaded...")
    return cookies

# Suppose we want our own profile signature here
def getdata(html):
    soup = BeautifulSoup(html.text, "lxml")
    mydata = soup.select("#display")[0].get_text()
    """
    Fetch and store any other post-login data here;
    this example only grabs the profile signature.
    """
    return mydata

def login_and_getdata(url):
    print("Getting cookies...")
    try:
        s.cookies = get_cookie_from_file()
    except Exception:
        print("Could not load cookies from file...\nFalling back to form login...")
        s.cookies = get_cookie_from_net(url)
    html = s.get("https://www.douban.com/people/210137543/", headers=headers)
    data = getdata(html)
    print(data)

if __name__ == "__main__":
    # A few globals
    s = requests.session()  # a session persists parameters (e.g. cookies) across requests
    ua = fake_useragent.UserAgent()
    headers = {"User-Agent": ua.random}
    # Log in and fetch the data
    login_and_getdata(url)
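Rather than passing headers into every call, the session itself can carry them; a minimal sketch:

import requests
import fake_useragent

s = requests.session()
s.headers.update({"User-Agent": fake_useragent.UserAgent().random})
# every s.get()/s.post() from here on sends this User-Agent automatically
html = s.get("https://www.douban.com/people/210137543/")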
Dealing with CAPTCHAs
import pytesseract
from PIL import Image

img = Image.open("1.png")
# Manual entry: display the image and type the code in
img.show()
captha_img = str(input("Enter the CAPTCHA: "))
# Automatic OCR
content = pytesseract.image_to_string(img)
print(content)
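OCR on noisy CAPTCHA images usually improves with simple preprocessing. A sketch assuming the same 1.png: convert to grayscale and binarize before recognizing (the threshold 128 is a guess to tune per image):

import pytesseract
from PIL import Image

img = Image.open("1.png").convert("L")             # grayscale
img = img.point(lambda p: 255 if p > 128 else 0)   # binarize; tune the threshold per image
print(pytesseract.image_to_string(img))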
Fetching Dynamically Loaded Content
1. Request the underlying API directly (a sketch follows this list)
2. selenium (the full example comes after the sketch)
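For option 1: pages that load content via XHR usually expose a JSON endpoint, which you can find in the browser's network panel and request directly. A minimal sketch; the endpoint URL here is a hypothetical placeholder:

import requests

# hypothetical JSON endpoint, found via the browser's network panel
json_url = "https://example.com/api/items?page=1"
resp = requests.get(json_url, headers={"User-Agent": "Mozilla/5.0"})
items = resp.json()  # the payload is already structured, no HTML parsing needed
print(items)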
import time
from selenium import webdriver

def getdata(html):
    # Parse the rendered page source here
    pass

def run():
    json_url = "https://www.csdn.net/"  # page to open
    # Instantiate the webdriver with Firefox
    driver = webdriver.Firefox()
    # Open the page
    driver.get(json_url)
    # Wait 5 seconds for the page to finish loading
    time.sleep(5)
    # Grab the rendered page source
    html = driver.page_source
    print(html)
    # Parse the page data here
    data = getdata(html)
    # Release the browser
    driver.quit()
    return data

if __name__ == "__main__":
    run()
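The fixed time.sleep(5) is fragile: too short and the page is not ready, too long and every run wastes time. selenium's explicit waits block until a chosen element appears, and Firefox can run headless; a sketch (in practice, wait for the specific element that signals your content has loaded, not just body):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = webdriver.FirefoxOptions()
options.add_argument("-headless")  # run without opening a browser window
driver = webdriver.Firefox(options=options)
driver.get("https://www.csdn.net/")
# block (up to 10 s) until the element is present, instead of sleeping blindly
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.TAG_NAME, "body"))
)
html = driver.page_source
driver.quit()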