【数据采集】第三次作业 - CSDN博客

本文链接：https://blog.csdn.net/Miaisma/article/details/115707688

（1）爬取34个省区市所有的省-市-区-镇/乡名称

作业要求：

从国家统计局网站爬取中国现有34个省区市的全部"省-市-区-镇/乡"名称，即行政区划最小到镇、乡村或农场一级，并将爬取结果按省或直辖市保存到文本文件中，每个省或直辖市一个文件。

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html

import requests
from bs4 import BeautifulSoup
import re

def getHTML(url, timeout=10):
    """Download a page and return its body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection would block the whole
            crawl indefinitely.

    Returns:
        The response body as a string, decoded with the 'gbk' codec.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    r = requests.get(url, headers=headers, timeout=timeout)
    # The encoding is forced to GBK instead of trusting the response headers.
    r.encoding = 'gbk'
    return r.text

def getNextURL(url, rank):
    """Collect the links to child pages listed on one page.

    Args:
        url: Address of the listing page to parse.
        rank: Prefix of the row CSS class to read; rows are matched on
            ``rank + 'tr'`` (the callers in this file pass 'city',
            'county' and 'town').

    Returns:
        A dict mapping the full text of every matching row that contains
        a link to the absolute URL of the page it links to. Rows without
        an ``<a>`` tag are skipped.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    dic = {}
    bs = BeautifulSoup(getHTML(url), 'html.parser')
    for item in bs.find_all('tr', class_=rank + 'tr'):
        link = item.a
        if link is not None:
            # urljoin resolves the href against the current page and, unlike
            # manual string splicing, also copes with absolute, root-relative
            # and '../' hrefs.
            dic[item.text] = urljoin(url, link.attrs['href'])
    return dic

# Entry page of the 2017 listing; every province page is linked from here.
index = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'

# Maps the text of each link on the entry page to its absolute URL.
city = {}
bs = BeautifulSoup(getHTML(index), 'html.parser')
for item in bs.find_all('a'):
    href = item.attrs.get('href')
    if href is None:
        # An anchor without an href cannot point at a province page.
        continue
    city[item.text] = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/' + href
# find_all('a') also picks up the ICP registration link, which is not a
# province. The default keeps this from raising KeyError if that link is
# missing or its text changes.
city.pop('京ICP备05034670号', None)

# Captures the three <td> cells of one 'villagetr' row. Compiled once here
# instead of once per page inside the innermost loop.
pattern = '<tr.+villagetr.+<td>(.+)</td><td>(.+)</td><td>(.+)</td></tr>'
p = re.compile(pattern)

# One output file per entry of `city`. Iterating .items() yields name and URL
# together, which avoids an O(n) reverse lookup on every write (and the wrong
# name being written if two entries ever shared a URL).
for province_name, url1 in city.items():
    # Explicit UTF-8 so the Chinese names are written the same way regardless
    # of the platform's default encoding.
    with open(province_name + '.txt', 'w', encoding='utf-8') as f:
        f.write(province_name + '\n')
        print('正在写入' + province_name)
        # NOTE: the variable names are shifted by one level relative to the
        # row class that is read: `county` holds the 'citytr' rows, `town`
        # the 'countytr' rows and `village` the 'towntr' rows.
        county = getNextURL(url1, 'city')
        for city_name, url2 in county.items():
            f.write(city_name + '\n')
            town = getNextURL(url2, 'county')
            for county_name, url3 in town.items():
                f.write(county_name + '\n')
                # getNextURL always returns a dict, so no None check is needed.
                village = getNextURL(url3, 'town')
                for town_name, url4 in village.items():
                    f.write(town_name + '\n')
                    sp = BeautifulSoup(getHTML(url4), 'html.parser')
                    for i in sp.find_all('tr', class_='villagetr'):
                        for j in p.findall(str(i)):
                            # j is the tuple of the three captured cells.
                            f.write('\t'.join(j) + '\n')
    # The with-statement closes the file; no explicit close() is required.

print('ok.')

（2）爬取豆瓣小说前10页所有小说的信息

作业要求：

爬取豆瓣网小说网页前10页所有小说的信息，包括每一部小说的如下信息（作者、小说名、出版社、出版日期、价格、评分、评分人数、内容简介），并将这些信息存储到数据库中。数据库不限于sqlite，一部小说一条记录。最后需要实现查询某部小说信息的功能。

https://book.douban.com/tag/小说

import requests
from bs4 import BeautifulSoup
import sqlite3

# Open the SQLite database file 'douban.db' in the working directory.
conn = sqlite3.connect('douban.db')
# "if not exists" makes the script re-runnable: a plain "create table" raises
# sqlite3.OperationalError on every run after the first one.
sql_tables = "create table if not exists noval(id INTEGER primary key autoincrement, author text, title text, press text, date text, price text, score text, people text, content text )"
conn.execute(sql_tables)
conn.commit()

def _node_text(node, strip=True):
    """Return the text of a BeautifulSoup node, or '' when the node is missing."""
    if node is None:
        return ''
    return node.text.strip() if strip else node.text


# The request headers never change, so they are built once.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}
# Placeholders let sqlite3 do the quoting. Building the statement with
# str.format breaks on any quote character in a title or description and
# allows SQL injection from scraped text.
sql = 'insert into noval values(null, ?, ?, ?, ?, ?, ?, ?, ?)'

for index in range(0, 200, 20):
    url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=' + str(index) + '&type=T'
    print('正在准备爬取第' + str(index // 20 + 1) + '页')
    html = requests.get(url, headers=headers, timeout=10).text
    bs = BeautifulSoup(html, 'html.parser')

    # Every field is read from the book's own container, so a book without
    # a description or a score gets '' for that field instead of shifting
    # the values of all following books. This replaces the hard-coded
    # "insert '' at page X, row Y" patches, which only matched one snapshot
    # of the site.
    # NOTE(review): assumes each book's h2, div.pub, span.rating_nums,
    # span.pl and description <p> all sit inside that book's div.info —
    # confirm against the live page markup.
    rows = []
    for info in bs.select('div.info'):
        parts = [part.strip() for part in _node_text(info.select_one('div.pub')).split('/')]
        # Author is the first segment; press, date and price are the last
        # three. Left-padding keeps short lines from raising IndexError.
        press, date, price = ([''] * 3 + parts)[-3:]
        rows.append((
            parts[0],
            _node_text(info.select_one('h2')),
            press,
            date,
            price,
            _node_text(info.select_one('span.rating_nums'), strip=False),
            _node_text(info.select_one('span.pl')),
            _node_text(info.select_one('p'), strip=False),
        ))

    conn.executemany(sql, rows)
    # One commit per page instead of one per row.
    conn.commit()
print('ok')

# Read back every stored row and print the whole list.
sql_select = 'select * from noval'
# Connection.execute creates a cursor, runs the statement on it and returns it.
cursor = conn.execute(sql_select)
conn.commit()
result = list(cursor)
print(result)

# Look up the titles written by one author.
# The table created by this script is `noval`; querying `noval1` raises
# sqlite3.OperationalError ("no such table"). The author is bound as a
# parameter instead of being written into the statement.
sql = 'select title from noval where author=?'
cursor = conn.cursor()
cursor.execute(sql, ('余华',))
result = cursor.fetchall()
# Show the result; otherwise the query has no visible effect when run as a script.
print(result)