python_爬虫爬取网站文本信息

第一步获取网页信息

#导入包
import requests
from requests.exceptions import ConnectionError
#定义函数获取网页
def get_one_page(url, timeout=10):
	"""Download *url* and return the response body, or None on any failure.

	timeout (seconds) keeps the scraper from hanging forever on a dead server.
	"""
	try:
		response = requests.get(url, timeout=timeout)
		if response.status_code == 200:
			return response.text
		return None
	except requests.exceptions.RequestException:
		# Covers connection errors, timeouts, bad URLs — fail soft with None.
		return None

def main():
	"""Fetch the Douban Top-250 page and dump the raw HTML to stdout."""
	print(get_one_page('http://movie.douban.com/top250'))

# Entry point: run only when executed as a script, not when imported.
if __name__ == '__main__':
	main()

第二步获取并过滤网页信息

import requests
from requests.exceptions import ConnectionError#引用异常处理模块
import re#正则表达式模块

def get_one_page(url, timeout=10):
	"""Download *url* and return the response body, or None on any failure.

	timeout (seconds) keeps the scraper from hanging forever on a dead server.
	"""
	try:
		response = requests.get(url, timeout=timeout)
		if response.status_code == 200:
			return response.text
		return None
	except requests.exceptions.RequestException:
		# Covers connection errors, timeouts, bad URLs — fail soft with None.
		return None

def parse_one_page(html):
    """Extract (rank, title) pairs from the page HTML with a regex.

    Prints the match list (as before) and also returns it, so callers and
    tests can use the result.
    """
    # Raw string avoids the invalid-escape-sequence warning on \d;
    # re.S lets .*? span the newlines between the rank and title tags.
    pattern = re.compile(r'<em class="">(\d+)</em>.*?<sp.*?">(.*?)</span>', re.S)
    items = pattern.findall(html)
    print(items)
    return items

def main():
	"""Fetch the Top-250 page and run the regex extractor over it."""
	page_html = get_one_page('http://movie.douban.com/top250')
	parse_one_page(page_html)

# Entry point: run only when executed as a script, not when imported.
if __name__ == '__main__':
	main()

第三步信息写入txt文本

import requests
from requests.exceptions import ConnectionError
import re
import json

def get_one_page(url, timeout=10):
    """Download *url* and return the response body, or None on any failure.

    timeout (seconds) keeps the scraper from hanging forever on a dead server.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts, bad URLs — fail soft with None.
        return None

def parse_one_page(html):
    """Yield one {'排名': rank, '名称': title} dict per movie found in *html*.

    Generator: results stream out one at a time instead of building a list.
    """
    # Raw string avoids the invalid-escape-sequence warning on \d;
    # re.S lets .*? span the newlines between the rank and title tags.
    pattern = re.compile(r'<em class="">(\d+)</em>.*?<sp.*?">(.*?)</span>', re.S)
    for rank, title in pattern.findall(html):
        yield {
            '排名': rank,
            '名称': title,
            }
def write_to_file(content):
    """Append *content* to result.txt as one JSON line (UTF-8)."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False writes Chinese characters literally, not \uXXXX.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
    # No explicit close needed: the with-statement closes the file, even on error.

def main():
    """Scrape the first Top-250 page and persist every parsed movie."""
    page_html = get_one_page('http://movie.douban.com/top250')
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_file(movie)

# Entry point: run only when executed as a script, not when imported.
if __name__ == '__main__':
	main()

开启多进程爬取前250个电影的内容

import requests
from multiprocessing import Pool
from requests.exceptions import ConnectionError
import re
import json

def get_one_page(url, timeout=10):
    """Download *url* and return the response body, or None on any failure.

    timeout (seconds) keeps the scraper from hanging forever on a dead server.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts, bad URLs — fail soft with None.
        return None

def parse_one_page(html):
    """Yield one {'排名': rank, '名称': title} dict per movie found in *html*.

    Generator: results stream out one at a time instead of building a list.
    """
    # Raw string avoids the invalid-escape-sequence warning on \d;
    # re.S lets .*? span the newlines between the rank and title tags.
    pattern = re.compile(r'<em class="">(\d+)</em>.*?<sp.*?">(.*?)</span>', re.S)
    for rank, title in pattern.findall(html):
        yield {
            '排名': rank,
            '名称': title,
            }
def write_to_file(content):
    """Append *content* to result.txt as one JSON line (UTF-8)."""
    with open('result.txt', 'a', encoding='utf-8') as f:
        # ensure_ascii=False writes Chinese characters literally, not \uXXXX.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
    # No explicit close needed: the with-statement closes the file, even on error.

def main(num):
    """Scrape one 25-film page of the Top-250 list, offset by *num*."""
    page_url = 'http://movie.douban.com/top250?start=' + str(num) + '&filter='
    for movie in parse_one_page(get_one_page(page_url)):
        print(movie)
        write_to_file(movie)

if __name__ == '__main__':
    # Fan the 10 result pages (offsets 0, 25, ..., 225) out across a
    # process pool sized to the CPU count.
    pool = Pool()
    pool.map(main, [i*25 for i in range(10)])
    # Close and join so worker processes are reaped instead of leaked.
    pool.close()
    pool.join()

PS:网站可能会有更新,有问题可以评论。

展开阅读全文

没有更多推荐了,返回首页