文章目录
七、Python简单爬虫
1.重要知识与技能
重要知识与技能:
- 使用HTML与CSS制作网页文件
- 使用re正则表达式抓取网页文件
- 使用requests获取网站内容
- 使用re正则表达式提取数据
- 使用XPath工具提取数据
- 使用BeautifulSoup工具
2.使用re正则表达式抓取网页文件
import re

# Read the whole local page into memory. The context manager closes the file
# even if read() raises, which the manual open()/close() pair did not.
with open('Index.html', 'r', encoding='UTF-8') as myFile:
    myContent = myFile.read()

# Alternative pattern that captures the content of every <li> element:
# myPatten = "<li>(.*)</li>"

# Raw string so that "\." reaches the regex engine as written instead of being
# parsed as an (invalid) Python string escape. The single capture group matches
# an e-mail-like token: local part, "@", then a domain with at least one dot.
myPatten2 = r"([a-zA-Z0-9_\.-]+@[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)+)"
mylist = re.findall(myPatten2, myContent)
print(mylist)
3.使用requests抓取网页
import requests

myURL = 'https://www.3dmgame.com'
# Pass an explicit timeout so an unresponsive server raises an exception
# instead of blocking the script indefinitely. The response body is decoded
# from raw bytes as UTF-8.
myContent = requests.get(myURL, timeout=10).content.decode('UTF-8')
4.使用re正则表达式提取数据
def Get3DMNews_WithRE():
    """Fetch the 3DM home page and extract news entries with a regular expression.

    Every match is printed as a dict and also collected into the result.

    :return: list of dicts with the keys 'title', 'herf' and 'time'
    """
    import requests
    import re

    myURL = 'https://www.3dmgame.com'
    # Explicit timeout so a stalled connection raises instead of hanging.
    myContent = requests.get(myURL, timeout=10).content.decode('UTF-8')

    # Capture groups, in order: link target, link text, text of the <span>
    # that follows on the next line.
    myPartten = '<a href="(.*)" target="_blank" >(.*)</a>\n <span>(.*)</span>'

    newsList = []
    for href, title, timeText in re.findall(myPartten, myContent):
        # Group 1 is the href and group 2 is the link text; the two were
        # previously stored under each other's key.
        # The key 'herf' (sic) is kept as-is because it is the key this
        # function has always emitted.
        myNews = {'title': title, 'herf': href, 'time': timeText}
        print(myNews)
        newsList.append(myNews)
    return newsList
5.使用XPath工具提取数据
def Get3DMNews_WithXPATH():
    """Fetch the 3DM home page and extract news entries with XPath queries.

    Every <li> element that has a direct <a> child (with an href) and a direct
    <span> child is printed as a dict and collected into the result.

    :return: list of dicts with the keys 'title', 'herf' and 'time'
    """
    import requests
    from lxml import html

    myURL = 'https://www.3dmgame.com'
    # Explicit timeout so a stalled connection raises instead of hanging.
    myContent = requests.get(myURL, timeout=10).content.decode('UTF-8')

    etree = html.etree
    eTreeHtml = etree.HTML(myContent)

    newsList = []
    for item in eTreeHtml.xpath("//li"):
        links = item.xpath('./a')
        hrefs = item.xpath('./a/@href')
        spans = item.xpath('./span')
        # "//li" selects every list item in the document, not only news rows.
        # Skip items missing any of the three parts instead of failing with
        # IndexError on the first such item.
        if not (links and hrefs and spans):
            continue
        myNews = {
            'title': links[0].text,
            'herf': hrefs[0],
            'time': spans[0].text,
        }
        print(myNews)
        newsList.append(myNews)
    return newsList
6.使用BeautifulSoup工具
def Get3DMNews_WithBeautifulSoup():
    """Fetch the 3DM home page and extract news entries with BeautifulSoup.

    Every selected <li> that contains both an <a> and a <span> is printed as a
    dict and collected into the result.

    :return: list of dicts with the keys 'title', 'herf' and 'time'
    :raises IndexError: if the page has fewer nested <div> elements than the
        hard-coded positions below expect
    """
    import requests
    from bs4 import BeautifulSoup

    myURL = 'https://www.3dmgame.com'
    # Explicit timeout so a stalled connection raises instead of hanging.
    myContent = requests.get(myURL, timeout=10).content.decode('UTF-8')
    bsHtml = BeautifulSoup(myContent, 'html5lib')

    # NOTE(review): these positional indices presumably matched the page layout
    # when the tutorial was written; any layout change shifts them. Consider
    # selecting the news container by class or id once the markup is confirmed.
    myList = bsHtml.find_all('div')[10].find_all('div')[8].find_all('div')[91].find_all('li')

    newsList = []
    for item in myList:
        link = item.find('a')
        span = item.find('span')
        # find() yields None when the tag is absent; skip such items rather
        # than failing with AttributeError on the None result.
        if link is None or span is None:
            continue
        myNews = {
            'title': link.get_text(),
            'herf': link.get('href'),
            'time': span.get_text(),
        }
        print(myNews)
        newsList.append(myNews)
    return newsList