一直都很想学习Python爬虫，正好这学期有项目需求，可以有针对性地好好学习一下，并在这里记录自己的学习之路~
import requests
import csv
from bs4 import BeautifulSoup
def getWebContent(url, data=None, timeout=10):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch with an HTTP GET.
        data: Accepted for backward compatibility; it is not sent with
            the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text, decoded as UTF-8.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response status.
    """
    # Browser-like headers sent with every request.
    header = {
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
    }
    rep = requests.get(url, headers=header, timeout=timeout)
    # Fail loudly on an error page instead of handing it to the parser.
    rep.raise_for_status()
    # Force UTF-8 rather than relying on the encoding requests guesses.
    rep.encoding = 'utf-8'
    return rep.text
def getData(html_text):
    """Extract one ``[category, total]`` row per category from the page.

    The first ``<dl>`` inside ``<div class="content">`` is read: each
    ``<dt>`` supplies a category label and the ``<ul>`` at the same
    position supplies the items whose ``<em>`` counts are summed.

    Args:
        html_text: HTML source of the page.

    Returns:
        A list of rows. Each row starts with the ``<dt>`` string; when a
        ``<ul>`` exists at the same position, the summed count is appended.
        An empty list is returned when the expected container is missing.

    Raises:
        ValueError: If an ``<em>`` text, minus its first and last
            character, is not an integer.
    """
    bs = BeautifulSoup(html_text, "html.parser")
    body = bs.body
    content = body.find('div', {'class': 'content'}) if body is not None else None
    dl = content.find('dl') if content is not None else None
    if dl is None:
        # Page layout not recognised: report "no categories" instead of
        # failing with an AttributeError on None.
        return []

    res = [[category.string] for category in dl.find_all('dt')]

    # zip pairs each row with the <ul> at the same position and stops at the
    # shorter sequence, so a surplus <ul> cannot index past the end of res.
    for row, item_list in zip(res, dl.find_all('ul')):
        total = 0
        for item in item_list.find_all('li'):
            em = item.find('em')
            if em is None:
                # An item without a count element contributes nothing.
                continue
            # The count is wrapped in one leading and one trailing character
            # (presumably parentheses -- TODO confirm against the live page).
            total += int(em.get_text(strip=True)[1:-1])
        row.append(total)
    return res
def writeFile(data, fileName, encoding='utf-8'):
    """Append rows to a CSV file.

    Args:
        data: Iterable of rows, each an iterable of cell values.
        fileName: Path of the CSV file; it is created if missing and
            appended to otherwise.
        encoding: Text encoding of the file. An explicit default keeps the
            output independent of the platform locale, under which
            non-ASCII cells could otherwise be dropped silently.
    """
    # errors='ignore' keeps the write best-effort: characters the chosen
    # encoding cannot represent are dropped instead of raising.
    # newline='' lets the csv module control line endings itself.
    with open(fileName, 'a', encoding=encoding, errors='ignore', newline='') as f:
        csv.writer(f).writerows(data)
if __name__ == '__main__':
    # Script entry point: download the category page, total the counts per
    # category, and append the rows to the CSV file.
    category_url = 'http://www.dir001.com/category'
    output_path = 'F:/total.csv'
    page_source = getWebContent(category_url)
    writeFile(getData(page_source), output_path)