Reference: Web Scraping with Python (《Python 网络数据采集》)
Getting started with web scraping
Installing BeautifulSoup (it is not part of the Python standard library, so it has to be installed separately)
On Linux:
sudo apt-get install python-bs4
On macOS:
sudo easy_install pip
pip is a package manager
pip install beautifulsoup4
If you have both Python 2.x and Python 3.x installed, you need to run your scraper scripts explicitly with python3
For example: python3 test_urlopen.py
BeautifulSoup may have been installed under Python 2.x instead of Python 3.x; in that case use:
sudo python3 setup.py install
pip3 install beautifulsoup4
On Windows:
python3 setup.py install
Start a Python interpreter in the terminal and test the import:
from bs4 import BeautifulSoup
If no error is raised, the import succeeded.
Otherwise, download the Windows version of pip (http://pypi.python.org/pypi/setuptools) and run:
pip install beautifulsoup4
Error handling:
UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
Fix: when you create a BeautifulSoup object, explicitly choose an HTML parser, e.g. "html.parser" (built into Python) or "lxml" (has to be installed separately).
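For example, a minimal sketch (using an inline HTML string just for illustration) that names the parser explicitly so the warning disappears:
from bs4 import BeautifulSoup
html_doc = "<html><body><h1>Hello</h1></body></html>"  # stand-in HTML for the demo
# "html.parser" ships with Python; no extra install is needed
soup = BeautifulSoup(html_doc, "html.parser")
# "lxml" is usually faster, but has to be installed first: pip3 install lxml
# soup = BeautifulSoup(html_doc, "lxml")
print(soup.h1.get_text())
Another common error: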
Traceback (most recent call last):
File "test_BeautifulSoup.py", line 1, in <module>
from urllib.request import urlopen
ImportError: No module named request
Fix: run the script with python3 xxx.py (urllib.request only exists in Python 3).
In general, an ImportError means a library is missing; running pip3 install xxx for the missing module fixes it.
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj=BeautifulSoup(html,"html.parser")
nameList=bsObj.findAll("span", {"class":"green"})
for name in nameList:
    print(name.get_text())
Output:
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
def getTitle(url):
    try:
        html = urlopen(url)
    except (HTTPError, URLError) as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title == None:
    print("Title could not be found!")
else:
    print(title)
Output:
The difference between find and findAll, and how to use them
findAll(tag, attributes, recursive, text, limit, keywords): recursive=True (the default) makes the search descend recursively through child tags
find(tag, attributes, recursive, text, keywords) is equivalent to findAll with limit=1; limit restricts the results to the first limit matches
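A minimal sketch (reusing the warandpeace.html page from the earlier example) that shows the difference:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/warandpeace.html")
bsObj = BeautifulSoup(html, "html.parser")
# findAll returns a list of every match; limit caps how many are returned
firstTwoGreen = bsObj.findAll("span", {"class":"green"}, limit=2)
# find returns only the first match (or None if nothing matches)
firstGreen = bsObj.find("span", {"class":"green"})
print(len(firstTwoGreen))
print(firstGreen.get_text())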
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj=BeautifulSoup(html, "html.parser")
for child in bsObj.find("table", {"id":"giftList"}).children:
    print(child)
Output:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj=BeautifulSoup(html, "html.parser")
images = bsObj.findAll("img", {"src":re.compile(r"\.\.\/img\/gifts/img.*\.jpg")})
for image in images:
    print(image["src"])
Output:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html=urlopen("https://en.wikipedia.org/wiki/Eric_Idle")
bsObj = BeautifulSoup(html, "html.parser")
for link in bsObj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re
random.seed(datetime.datetime.now())
def getLinks(articleUrl):
    html = urlopen("https://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
pages = set()
random.seed(datetime.datetime.now())
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] not in internalLinks:
            internalLinks.append(link.attrs['href'])
    return internalLinks
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    for link in bsObj.findAll("a", href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] not in externalLinks:
            externalLinks.append(link.attrs['href'])
    return externalLinks
def splitAddress(address):
    addressParts = address.replace("https://", "").replace("http://", "").split("/")
    return addressParts
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])
        return getRandomExternalLink(internalLinks[random.randint(0, len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print("Random external link: " + externalLink)
    followExternalOnly(externalLink)
followExternalOnly("https://oreilly.com")
Press Ctrl+C to stop.
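Because followExternalOnly() recurses until it is interrupted, here is a small sketch (my own addition, not from the book) that catches the KeyboardInterrupt raised by Ctrl+C so the run ends with a clean message instead of a traceback; it replaces the last call in the block above:
try:
    followExternalOnly("https://oreilly.com")
except KeyboardInterrupt:
    print("Crawl stopped by user")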
import os
from urllib.request import urlretrieve
from urllib.request import urlopen
from bs4 import BeautifulSoup
downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"
def getAbsoluteURL(baseUrl, source):
    if source.startswith("http://www."):
        url = "http://"+source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        url = "http://"+source[4:]
    else:
        url = baseUrl+"/"+source
    if baseUrl not in url:
        return None
    return url
def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory+path
    directory = os.path.dirname(path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    return path
html = urlopen("http://www.pythonscraping.com")
bsObj=BeautifulSoup(html, "html.parser")
downloadList = bsObj.findAll(src=True)
for download in downloadList:
    fileUrl = getAbsoluteURL(baseUrl, download["src"])
    if fileUrl is not None:
        print(fileUrl)
        urlretrieve(fileUrl, getDownloadPath(baseUrl, fileUrl, downloadDirectory))
This creates a downloaded directory under the current folder; the scraped images are saved in downloaded/img.
Connecting Python to a database
1. Install MySQL. Once it is installed successfully,
2. install the pymysql module.
You can use the pip package manager directly:
pip3 install pymysql
Or build it from the GitHub tarball:
curl -L https://github.com/PyMySQL/PyMySQL/tarball/pymysql-0.6.2 | tar xz
cd PyMySQL-PyMySQL-f953785
python3 setup.py install  (drop the 3 if you are using Python 2.x)
sudo mysql -uroot -p
Once you are in the MySQL shell:
create database scraping;
use scraping;
create table pages (id BIGINT(7) not null AUTO_INCREMENT, title varchar(200), content varchar(10000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, primary key(id));
import pymysql
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='000000', db='mysql')
cur = conn.cursor()
cur.execute("USE scraping")
cur.execute("SELECT * FROM pages WHERE id=1")
print(cur.fetchone())
cur.close()
conn.close()
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random
import pymysql
conn = pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='000000', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("USE scraping")
random.seed(datetime.datetime.now())
def store(title, content):
    cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))
    cur.connection.commit()
def getLinks(articleUrl):
    html = urlopen("https://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    title = bsObj.find("h1").get_text()
    content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()
    store(title, content)
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
CREATE DATABASE wikipedia;
use wikipedia;
CREATE TABLE pages( id INT NOT NULL AUTO_INCREMENT, url varchar(255) not NULL, created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id));
CREATE TABLE link( id INT NOT NULL AUTO_INCREMENT, fromPageId INT NULL, toPageId INT NULL, created TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id));
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import pymysql
conn=pymysql.connect(host='127.0.0.1', unix_socket='/tmp/mysql.sock', user='root', passwd='000000', db='mysql', charset='utf8')
cur = conn.cursor()
cur.execute("USE wikipedia")
def insertPageIfNotExists(url):
    cur.execute("SELECT * FROM pages WHERE url = %s", (url,))
    if cur.rowcount == 0:
        cur.execute("INSERT INTO pages (url) VALUES (%s)", (url,))
        conn.commit()
        return cur.lastrowid
    else:
        return cur.fetchone()[0]
def insertLink(fromPageId, toPageId):
    cur.execute("SELECT * FROM link WHERE fromPageId = %s AND toPageId = %s", (fromPageId, toPageId))
    if cur.rowcount == 0:
        cur.execute("INSERT INTO link (fromPageId, toPageId) VALUES (%s, %s)", (fromPageId, toPageId))
        conn.commit()
pages = set()
def getLinks(pageUrl, recursionLevel):
    global pages
    if recursionLevel > 4:
        return
    pageId = insertPageIfNotExists(pageUrl)
    html = urlopen("https://en.wikipedia.org"+pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)((?!:).)*$")):
        insertLink(pageId, insertPageIfNotExists(link.attrs['href']))
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage, recursionLevel+1)
getLinks("/wiki/Kevin_Bacon", 0)
cur.close()
conn.close()
Reading PDF files
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
def readPDF(pdfFile):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    process_pdf(rsrcmgr, device, pdfFile)
    device.close()
    content = retstr.getvalue()
    retstr.close()
    return content
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
outputString = readPDF(pdfFile)
print(outputString)
pdfFile.close()
Submitting a file:
import requests
files = {'uploadFile': open('./files/test.png','rb')}
r=requests.post("http://pythonscraping.com/pages/processing2.php", files=files)
print(r.text)
Three ways to submit a form:
#python_http.py
import requests
from requests.auth import AuthBase
from requests.auth import HTTPBasicAuth
auth = HTTPBasicAuth('ryan', 'password')
r=requests.post(url="http://pythonscraping.com/pages/auth/login.php", auth=auth)
print(r.text)
#python_requests.py
import requests
params = {'username':'Ryan', 'password': 'password'}
r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
print("Cookie is set to:")
print(r.cookies.get_dict())
print("--------------")
print("Going to profile page...")
r=requests.get("http://pythonscraping.com/pages/cookies/profile.php", cookies=r.cookies)
print(r.text)
#python_session.py
import requests
session = requests.Session()
params = {'username':'username', 'password':'password'}
s = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params)
print("Cookie is set to:")
print(s.cookies.get_dict())
print("---------")
print("Going to profile page...")
s=session.get("http://pythonscraping.com/pages/cookies/profile.php")
print(s.text)
Faking HTTP headers
import requests
from bs4 import BeautifulSoup
session = requests.Session()
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome", "Accept":"text/html, application/xhtml+xml, application/xml;q=0.9,image/webp,*/*;q=0.8"}
url="https://www.whatismybrowser.com/developers/what-http-headers-is-my-browser-sending"
req=session.get(url, headers=headers)
bsObj=BeautifulSoup(req.text, "html.parser")
print(bsObj.find("table", {"class":"table-striped"}).get_text())