Web Scraping Homework

Task 1:

import requests
import csv
from bs4 import BeautifulSoup
import re

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent': user_agent}
# Placeholder: the URL of the national policy page on gov.cn goes here
policies = requests.get("#国家政策信息网网址", headers=headers)
policies.encoding = policies.apparent_encoding

p = BeautifulSoup(policies.text, 'html.parser')
# Collect every link whose href contains "content" (the policy detail pages)
contents = p.find_all(href=re.compile('content'))
rows = []

for content in contents:
    href = content.get('href')
    # ('国务院' = State Council, link text, link URL)
    row = ('国务院', content.string, href)
    rows.append(row)

# CSV header: issuing department, title, link
header = ['发文部门', '标题', '链接']
# newline='' keeps csv.writer from inserting blank rows on Windows
with open('policies.csv', 'w', encoding='gb18030', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(header)
    f_csv.writerows(rows)

print('\n\n最新的信息获取完成\n\n')  # "Finished fetching the latest information"
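As a quick illustration of the two BeautifulSoup calls used above, the minimal sketch below runs the same `find_all(href=re.compile('content'))` filter against a small, made-up HTML fragment (not the real gov.cn markup) and prints both `tag.string` and `tag.get_text()`. It shows that `.string` comes back as `None` once a link contains nested tags, which is the pitfall task 2 works around:

```python
from bs4 import BeautifulSoup
import re

# Hypothetical fragment standing in for the gov.cn listing page
html = '''
<ul>
  <li><a href="/zhengce/content/202401/content_001.htm"><span>[图解]</span>政策标题一</a></li>
  <li><a href="/zhengce/content/202401/content_002.htm">政策标题二</a></li>
  <li><a href="/about.htm">关于我们</a></li>
</ul>
'''

soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all(href=re.compile('content')):
    # .string is None when the tag has mixed children; get_text() always returns the text
    print(tag.get('href'), repr(tag.string), tag.get_text())
```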

Task 2:

import requests
import csv
from bs4 import BeautifulSoup
import re

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent': user_agent}
# Placeholder: the target URL goes here
policies = requests.get("# http", headers=headers)
policies.encoding = policies.apparent_encoding

p = BeautifulSoup(policies.text, 'html.parser')
contents = p.find_all(href=re.compile('content'))
rows = []

for content in contents:
    href = content.get('href')
    # get_text() instead of .string, so links with nested tags are not lost
    row = (content.get_text(), href)
    rows.append(row)

# Preview the first five results (slicing avoids an IndexError if fewer were found)
for row in rows[:5]:
    print(row)

# header = ['发文部门','标题','链接']
# with open('policies.csv','w',encoding='gb18030') as f:
#     f_csv = csv.writer(f)
#     f_csv.writerow(header)
#     f_csv.writerows(rows)

print('\n\n最新的信息获取完成\n\n')  # "Finished fetching the latest information"
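The hrefs printed here are a mix of absolute and site-relative links. A minimal sketch (with made-up paths) of normalizing them with `urllib.parse.urljoin`, which is a common alternative to the manual `'http://www.gov.cn' + href` concatenation used in task 3:

```python
from urllib.parse import urljoin

# Hypothetical hrefs as they might come back from the listing page
hrefs = ['/zhengce/content/202401/content_001.htm',
         'http://www.gov.cn/zhengce/content/202401/content_002.htm']

base = 'http://www.gov.cn'
for href in hrefs:
    # urljoin leaves absolute URLs alone and resolves relative ones against the base
    print(urljoin(base, href))
```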

Task 3:

import requests
import csv
from bs4 import BeautifulSoup
import re

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent': user_agent}
# Placeholder: the URL of the policy listing page goes here
policies = requests.get("# http", headers=headers)
policies.encoding = policies.apparent_encoding
p = BeautifulSoup(policies.text, 'html.parser')
contents = p.find_all('h4')  # each <h4> on the listing page wraps one policy link
xuehao = []  # all detail-page hrefs, in listing order
rows = []

# for content in contents:
#     row = ('国务院', content.a.get_text(), content.span.get_text())
#     rows.append(row)
for i in range(len(contents)):
    href = contents[i].find('a').attrs['href']
    xuehao.append(href)
# Keep only the links whose list index contains the digit 6
# (the last digit of my student ID), e.g. indices 6, 16, 26, 60-69, ...
xuehao_list = [xuehao[i] for i in range(len(xuehao)) if '6' in str(i)]


def get_tocontent(url):
    """Fetch one policy detail page and append its metadata to rows."""
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
    headers = {'User-Agent': user_agent}
    policies = requests.get(url, headers=headers)
    policies.encoding = policies.apparent_encoding
    p = BeautifulSoup(policies.text, 'html.parser')
    news_con = p.find('table')  # the metadata table on the detail page, if present
    if news_con is None:
        # Pages without a metadata table only expose title, date and source
        title = p.find('h1').get_text()
        time = p.find('div', class_='pages-date').get_text().split(' ')[0]
        laiyuan = p.find('span', class_='font').get_text()  # source
        row = ('None', 'None', laiyuan, time, title, 'None', time)
    else:
        # Pull the labelled cells out of the metadata table: index number,
        # topic category, issuing authority, date written, title,
        # document number, date published
        suoyin = news_con.findAll('tr')[0].findAllNext('td')[2].get_text()
        zhuti = news_con.findAll('tr')[1].findAllNext('td')[3].get_text()
        fawen = news_con.findAll('tr')[2].findAllNext('td')[1].get_text()
        chengwen = news_con.findAll('tr')[2].findAllNext('td')[3].get_text()
        biaoti = news_con.findAll('tr')[2].findAllNext('td')[5].get_text()
        zihao = news_con.findAll('tr')[4].findAllNext('td')[1].get_text()
        fabu = news_con.findAll('tr')[4].findAllNext('td')[3].get_text()
        row = (suoyin, zhuti, fawen, chengwen, biaoti, zihao, fabu)
    rows.append(row)  # appending mutates the module-level list; no global needed
# get_tocontent('')

for i in range(len(xuehao_list)):
    if xuehao_list[i].startswith('http'):
        get_tocontent(xuehao_list[i])
    else:
        # Relative links need the site prefix
        url = 'http://www.gov.cn' + xuehao_list[i]
        get_tocontent(url)
for j in range(len(rows)):
    print(rows[j])
# CSV header: (student ID ends in 6) index number, topic category, issuing authority,
# date written, title, document number, date published
csv_header = ['学号尾号为6:索引号', '主题分类', '发文机关', '成文日期', '标题', '发文字号', '发布日期']
# newline='' keeps csv.writer from inserting blank rows on Windows
with open('task2.csv', 'w', encoding='gb18030', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(csv_header)
    f_csv.writerows(rows)
print('\n\n最新的信息获取完成\n\n')  # "Finished fetching the latest information"
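The `findAll('tr')[...].findAllNext('td')[...]` indexing above is tied to one exact table layout and breaks if a cell is added or removed. Below is a minimal alternative sketch, under the assumption (not verified against the live gov.cn pages) that the metadata table alternates label cells such as '索引号' with their value cells; it pairs them into a dict so fields can be looked up by name instead of position:

```python
from bs4 import BeautifulSoup

def parse_meta_table(html):
    """Map label <td>s to value <td>s instead of relying on fixed positions."""
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table')
    if table is None:
        return {}
    cells = [td.get_text(strip=True) for td in table.find_all('td')]
    # Pair up consecutive cells: label, value, label, value, ...
    return dict(zip(cells[0::2], cells[1::2]))

# Usage with a made-up fragment:
sample = ('<table><tr><td>索引号</td><td>000014349/2024-00001</td>'
          '<td>主题分类</td><td>综合政务</td></tr></table>')
print(parse_meta_table(sample))
```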

Finally, here is a simple multi-threaded Python crawler example:

```python
import requests
import threading
from queue import Queue

# Global list that collects the scraped data
data = []

# Lock so that multiple threads do not write to `data` at the same time
lock = threading.Lock()

# Queue holding the URLs that still need to be crawled
url_queue = Queue()

# Worker thread class
class CrawlerThread(threading.Thread):
    def __init__(self, url_queue):
        super().__init__()
        self.url_queue = url_queue

    def run(self):
        while True:
            # Take one URL from the queue
            url = self.url_queue.get()
            try:
                # Send the request and read the response
                response = requests.get(url)
                content = response.text
                # Process the response (real processing omitted here)
                processed_data = content
                # Store the processed data in the global list
                with lock:
                    data.append(processed_data)
                # Mark this URL as handled
                self.url_queue.task_done()
            except Exception as e:
                print(f"Error occurred while crawling {url}: {e}")
                # On error, put the URL back into the queue for a retry
                self.url_queue.put(url)
                self.url_queue.task_done()

# Entry point
def main():
    # Build the list of URLs to crawl
    urls = ["http://www.example.com/page{}".format(i) for i in range(1, 11)]
    # Feed the URLs into the queue
    for url in urls:
        url_queue.put(url)
    # Create and start several crawler threads
    for i in range(5):
        t = CrawlerThread(url_queue)
        t.daemon = True  # daemon threads let the program exit once the queue is drained
        t.start()
    # Block the main thread until every URL has been processed
    url_queue.join()
    # Print the crawled results
    print(data)

if __name__ == '__main__':
    main()
```

In this example, the `CrawlerThread` class represents a crawl thread, the `url_queue` queue holds the URLs waiting to be crawled, the `data` list collects the scraped results, and the `lock` ensures that concurrent writes to `data` do not corrupt it.

In `main`, the URL list is pushed into `url_queue`, several `CrawlerThread` instances are created and started, and `url_queue.join()` then blocks the main thread until every URL has been handled.

In `CrawlerThread.run`, `self.url_queue.get()` takes one URL from the queue, the request is sent, and the response is parsed. If an error occurs, the URL is put back into the queue so another thread can retry it; on success the processed data is appended to `data` and `self.url_queue.task_done()` marks that URL as finished.

Note that a multi-threaded crawler can get its IP banned by sites that limit request frequency. In practice, choose the number of threads and the request rate carefully to avoid being blocked.
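As a follow-up to that last point, one simple way to throttle the workers is to sleep between requests inside each thread and to keep the thread count small. The sketch below is only an illustration of that idea (the `polite_worker` name, the delay range, and the timeout are assumptions, not values from the example above):

```python
import time
import random
import requests
from queue import Queue

REQUEST_DELAY = (1.0, 3.0)  # assumed polite delay range in seconds; tune per site

def polite_worker(url_queue: Queue, results: list) -> None:
    """Worker loop shaped like CrawlerThread.run, with a random pause per request."""
    while True:
        url = url_queue.get()
        try:
            response = requests.get(url, timeout=10)
            # list.append is atomic under CPython's GIL; use a lock for anything more
            results.append((url, response.status_code))
        except requests.RequestException as e:
            print(f"request failed for {url}: {e}")
        finally:
            url_queue.task_done()
        # Sleep a random amount between requests to stay under rate limits
        time.sleep(random.uniform(*REQUEST_DELAY))
```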
