Reference: Web Scraping with Python (《Python 网络数据采集》)
Getting started with web scraping
Installing BeautifulSoup (it is not part of the Python standard library, so it has to be installed separately)
On Linux:
sudo apt-get install python-bs4
On macOS:
sudo easy_install pip
pip is a package manager
pip install beautifulsoup4
If you have both Python 2.x and Python 3.x installed, you need to run your scraper scripts explicitly with python3
For example: python3 test_urlopen.py
BeautifulSoup may have been installed under Python 2.x instead of Python 3.x; in that case use:
sudo python3 setup.py install
pip3 install beautifulsoup4
On Windows:
python3 setup.py install
Start a Python interpreter in the terminal and test the import:
from bs4 import BeautifulSoup
If no error is raised, the import succeeded.
Otherwise, download the Windows version of pip (http://pypi.python.org/pypi/setuptools) and run:
pip install beautifulsoup4
Error handling:
UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
Fix: when you create a BeautifulSoup object, explicitly choose an HTML parser, e.g. "html.parser" (built into Python) or "lxml" (has to be installed separately).
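For example, a minimal sketch (using an inline HTML string just for illustration) that names the parser explicitly so the warning disappears:
from bs4 import BeautifulSoup
html_doc = "<html><body><h1>Hello</h1></body></html>"  # stand-in HTML for the demo
# "html.parser" ships with Python; no extra install is needed
soup = BeautifulSoup(html_doc, "html.parser")
# "lxml" is usually faster, but has to be installed first: pip3 install lxml
# soup = BeautifulSoup(html_doc, "lxml")
print(soup.h1.get_text())
Another common error: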
Traceback (most recent call last):
File "test_BeautifulSoup.py", line 1, in <module>
from urllib.request import urlopen
ImportError: No module named request
Fix: run the script with python3 xxx.py (urllib.request only exists in Python 3).
In general, an ImportError means a library is missing; running pip3 install xxx for the missing module fixes it.
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj=BeautifulSoup(html,"html.parser")
nameList=bsObj.findAll("span", {"class":"green"})
for name in nameList:
    print(name.get_text())
Output:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
def getTitle(url):
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title == None:
    print("Title could not be found!")
else:
    print(title)
Output:
The difference between find and findAll, and how to use them
findAll(tag, attributes, recursive, text, limit, keywords): recursive=True (the default) makes the search descend recursively through child tags
find(tag, attributes, recursive, text, keywords) is equivalent to findAll with limit=1; limit restricts the results to the first limit matches
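A minimal sketch (reusing the warandpeace.html page from the earlier example) that shows the difference:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, "html.parser")
# findAll returns a list of every match; limit caps how many are returned
firstTwoGreen = bsObj.findAll("span", {"class":"green"}, limit=2)
# find returns only the first match (or None if nothing matches)
firstGreen = bsObj.find("span", {"class":"green"})
print(len(firstTwoGreen))
print(firstGreen.get_text())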
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj=BeautifulSoup(html, "html.parser")
for child in bsObj.find("table", {"id":"giftList"}).children:
    print(child)
Output:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj=BeautifulSoup(html, "html.parser")
images = bsObj.findAll("img", {"src":re.compile(r"\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
Output:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("https://en.wikipedia.org/wiki/Eric_Idle")
bsObj = BeautifulSoup(html, "html.parser")
for link in bsObj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen("https://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
random.seed(datetime.datetime.now())
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] not in internalLinks:
            internalLinks.append(link.attrs['href'])
    return internalLinks
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] not in externalLinks:
            externalLinks.append(link.attrs['href'])
    return externalLinks
def splitAddress(address):
    addressParts = address.replace("https://", "").replace("http://", "").split("/")
    return addressParts
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link: " + externalLink)
    followExternalOnly(externalLink)
followExternalOnly("https://oreilly.com")
Press Ctrl+C to stop.
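Because followExternalOnly() recurses until it is interrupted, here is a small sketch (my own addition, not from the book) that catches the KeyboardInterrupt raised by Ctrl+C so the run ends with a clean message instead of a traceback; it replaces the last call in the block above:
try:
    followExternalOnly("https://oreilly.com")
except KeyboardInterrupt:
    print("Crawl stopped by user")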
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
    if source.startswith("http://www."):
        url = "http://"+source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://"+source[4:]
    else:
        url = baseUrl+"/"+source
    if baseUrl not in url:
        return None
    return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory+path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path
html = urlopen("http://www.pythonscraping.com")
bsObj=BeautifulSoup(html, "html.parser")
downloadList = bsObj.findAll(src=True)
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
This creates a downloaded directory under the current folder; the scraped images are saved in downloaded/img.
Connecting Python to a database
1. Install MySQL. Once it is installed successfully,
2. install the pymysql module.
You can use the pip package manager directly:
pip3 install pymysql
Or build it from the GitHub tarball:
curl -L https://github.com/PyMySQL/PyMySQL/tarball/pymysql-0.6.2 | tar xz
cd PyMySQL-PyMySQL-f953785
python3 setup.py install  (drop the 3 if you are using Python 2.x)
sudo mysql -uroot -p
Once you are in the MySQL shell:
create database scraping;
use scraping;
create table pages (id BIGINT(7) not null AUTO_INCREMENT, title varchar(200), content varchar(10000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, primary key(id));
import pymysql
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='000000', db='mysql')
cur = conn.cursor()
cur.execute("USE scraping")
cur.execute("SELECT * FROM pages WHERE id=1")
print(cur.fetchone())
cur.close()
conn.close()
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
import pymysql
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='000000', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("USE scraping")
random.seed(datetime.datetime.now())
def store(title, content):
    cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))
    cur.connection.commit()
def getLinks(articleUrl):
    html = urlopen("https://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    title = bsObj.find("h1").get_text()
    content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()
    store(title, content)
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
CREATE DATABASE wikipedia;
use wikipedia;
CREATE TABLE pages( id INT NOT NULL AUTO_INCREMENT, url varchar(255) not NULL, created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id));
CREATE TABLE link( id INT NOT NULL AUTO_INCREMENT, fromPageId INT NULL, toPageId INT NULL, created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id));
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import pymysql
conn=pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='000000', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("USE wikipedia")
def insertPageIfNotExists(url):
    cur.execute("SELECT * FROM pages WHERE url = %s", (url,))
    if cur.rowcount == 0:
        cur.execute("INSERT INTO pages (url) VALUES (%s)", (url,))
        conn.commit()
        return cur.lastrowid
    else:
        return cur.fetchone()[0]
def insertLink(fromPageId, toPageId):
    cur.execute("SELECT * FROM link WHERE fromPageId = %s AND toPageId = %s", (fromPageId, toPageId))
    if cur.rowcount == 0:
        cur.execute("INSERT INTO link (fromPageId, toPageId) VALUES (%s, %s)", (fromPageId, toPageId))
        conn.commit()
pages = set()
def getLinks(pageUrl, recursionLevel):
    global pages
    if recursionLevel > 4:
        return
    pageId = insertPageIfNotExists(pageUrl)
    html = urlopen("https://en.wikipedia.org"+pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        insertLink(pageId, insertPageIfNotExists(link.attrs['href']))
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage, recursionLevel+1)
getLinks("/wiki/Kevin_Bacon", 0)
cur.close()
conn.close()
Reading PDF files
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    process_pdf(rsrcmgr, device, pdfFile)
    device.close()
    content = retstr.getvalue()
    retstr.close()
    return content
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
Submitting a file:
import requests
files = {'uploadFile': open('./files/test.png','rb')}
r=requests.post("http://pythonscraping.com/pages/processing2.php", files=files)
print(r.text)
Three ways to submit a form:
#python_http.py
import requests
from requests.auth import AuthBase
from requests.auth import HTTPBasicAuth
auth = HTTPBasicAuth('ryan', 'password')
r=requests.post(url="http://pythonscraping.com/pages/auth/login.php", auth=auth)
print(r.text)
#python_requests.py
import requests
params = {'username':'Ryan', 'password': 'password'}
r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
print("Cookie is set to:")
print(r.cookies.get_dict())
print("--------------")
print("Going to profile page...")
r=requests.get("http://pythonscraping.com/pages/cookies/profile.php", cookies=r.cookies)
print(r.text)
#python_session.py
import requests
session = requests.Session()
params = {'username':'username', 'password':'password'}
s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
print("Cookie is set to:")
print(s.cookies.get_dict())
print("---------")
print("Going to profile page...")
s=session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(s.text)
Faking HTTP headers
import requests
from bs4 import BeautifulSoup
session = requests.Session()
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome", "Accept":"text/html, application/xhtml+xml, application/xml;q=0.9,image/webp,*/*;q=0.8"}
url="https://www.whatismybrowser.com/developers/what-http-headers-is-my-browser-sending"
req=session.get(url, headers=headers)
bsObj=BeautifulSoup(req.text, "html.parser")
print(bsObj.find("table", {"class":"table-striped"}).get_text())