Python爬虫(一)：爬取单页面

最新推荐文章于 2024-09-25 11:15:45 发布

sue35

最新推荐文章于 2024-09-25 11:15:45 发布

阅读量2.6k

点赞数

分类专栏： Python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/u012901209/article/details/77803203

版权

Python 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

一直都很想学习 Python 爬虫，正好这学期有项目需求，可以有针对性地好好学习一下，并在这里记录自己的学习之路。下面的脚本会抓取 dir001.com 的分类页面，统计每个分类下各条目括号内数字的总和，并把结果追加写入 CSV 文件。


# coding: UTF-8
import requests
import csv
from bs4 import BeautifulSoup

def getWebContent(url, data = None, timeout = 10):
    """Download a page with HTTP GET and return its body as text.

    Args:
        url: Address of the page to fetch.
        data: Accepted for backward compatibility; it is not sent with the
            request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    # Browser-like request headers; the User-Agent identifies as Edge on
    # Windows 10.
    header = {
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
    }
    rep = requests.get(url, headers = header, timeout = timeout)
    # Fail loudly on an error response instead of handing an error page to
    # the parser as if it were the real content.
    rep.raise_for_status()
    # Force UTF-8 decoding rather than relying on the encoding requests
    # guesses from the response.
    rep.encoding = 'utf-8'
    return rep.text

def getData(html_text):
    """Extract one row per category from the category page markup.

    The function looks inside ``<body>`` for the first ``<div class="content">``
    and, inside it, the first ``<dl>``. Every ``<dt>`` in that list starts a
    row holding the tag's ``.string``; the i-th ``<ul>`` is paired with the
    i-th ``<dt>`` and the numbers found in its ``<li><em>`` elements are
    summed and appended to that row.

    Args:
        html_text: HTML source of the page.

    Returns:
        A list of rows. A row is ``[category, total]`` when a ``<ul>`` was
        paired with the ``<dt>`` and ``[category]`` otherwise. An empty list
        is returned when the page has no ``<body>``, no ``div.content`` or no
        ``<dl>`` (for example for empty input). Any ``<ul>`` beyond the
        number of ``<dt>`` elements is ignored.

    Raises:
        AttributeError, TypeError, ValueError: If an ``<li>`` lacks an
            ``<em>`` or the ``<em>`` text is not a number wrapped in one
            character on each side.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    # Walk down body -> div.content -> dl, stopping at the first missing
    # level so an unexpected page yields no rows instead of an opaque
    # attribute error on None.
    body = soup.body
    content = body.find('div', {'class' : 'content'}) if body is not None else None
    listing = content.find('dl') if content is not None else None
    if listing is None:
        return []

    rows = [[category.string] for category in listing.find_all('dt')]

    # Pair each category row with the <ul> at the same position.
    for row, group in zip(rows, listing.find_all('ul')):
        # [1:-1] drops the first and last character of the <em> text;
        # presumably the count is wrapped in brackets such as "(12)" --
        # TODO confirm against the live page.
        row.append(sum(
            int(item.find('em').string[1:-1])
            for item in group.find_all('li')
        ))
    return rows

def writeFile(data, fileName, encoding = 'utf-8'):
    """Append rows to a CSV file.

    Args:
        data: Iterable of rows, each row an iterable of cell values.
        fileName: Path of the CSV file. The file is opened in append mode,
            so it is created when missing and repeated calls add rows after
            the existing ones.
        encoding: Text encoding of the file. Defaults to UTF-8 so that
            non-ASCII cells are written in full; with the platform default
            encoding and ``errors='ignore'`` they could be dropped silently.
    """
    # newline='' lets the csv module control line endings itself.
    # errors='ignore' is kept so a code point the chosen encoding cannot
    # represent is dropped rather than aborting the write.
    with open(fileName, 'a', encoding = encoding, errors = 'ignore', newline = '') as f:
        csv.writer(f).writerows(data)

if __name__ == '__main__':
    # Script entry point: download the category page, compute the rows for
    # every category, and append them to the CSV report.
    category_page = 'http://www.dir001.com/category'
    rows = getData(getWebContent(category_page))
    writeFile(rows, 'F:/total.csv')