1: First, determine the site to crawl: the pattern of the URLs to be crawled, and whether the request method is POST or GET.
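For the Maoyan Top 100 board scraped below, every page is a plain GET request, and the only thing that changes from page to page is the offset query parameter, which grows in steps of 10. A minimal sketch of that URL pattern (for illustration only; it assumes 10 pages of 10 entries each, matching the final script):

base_url = 'http://maoyan.com/board/4?offset={}'
for i in range(10):
    # offset = 0, 10, 20, ... 90 -- one page of 10 movies each
    print(base_url.format(i * 10))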
2: Then write a simple spider to test fetching the page:
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def main():
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    print(html)


if __name__ == "__main__":
    main()
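Once the request comes back with status 200, it helps to dump the returned HTML to a local file and study the <dd> blocks before writing the extraction regex in the next step. A small sketch that reuses the get_one_page defined above (the file name maoyan_sample.html is just an example):

html = get_one_page('http://maoyan.com/board/4?offset=0')
if html:
    # save a sample page so the tag structure can be inspected offline
    with open('maoyan_sample.html', 'w', encoding='utf-8') as f:
        f.write(html)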
3: After the test passes, add a loop over the pages and parse the scraped content, then save it: method one writes a txt file, method two writes a csv file:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
import time
import json
from requests.exceptions import RequestException


def get_one_page(url):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# parse_one_page parses the html; re.S makes '.' also match newlines,
# and each (.*?) group captures one field non-greedily
def parse_one_page(html):
    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name"><a'
        + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    # html is the target string the pattern is matched against
    items = re.findall(pattern, html)
    for item in items:
        yield {
            "index": item[0],
            "image": item[1],
            "name": item[2],
            "actor": item[3].strip(),
            "time": item[4].strip(),
            "star": item[5] + item[6],
        }


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        # write_to_file(item)
        write_to_csv(item)


def write_to_csv(content):
    with open("猫眼result.csv", 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


# def write_to_file(content):
#     # 'a' opens the file in append mode
#     with open('猫眼result.txt', 'a', encoding='utf-8') as f:
#         f.write(json.dumps(content, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)
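Note that write_to_csv above actually writes one JSON object per line rather than real comma-separated columns. If a file with proper columns is wanted, here is a hedged alternative sketch using Python's built-in csv module (write_row_to_csv and the file name maoyan_rows.csv are my own example names, not part of the original script):

import csv
import os

FIELDS = ["index", "image", "name", "actor", "time", "star"]


def write_row_to_csv(item, path="maoyan_rows.csv"):
    # write the header row only the first time the file is created
    new_file = not os.path.exists(path)
    with open(path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        if new_file:
            writer.writeheader()
        writer.writerow(item)

Calling write_row_to_csv(item) in place of write_to_csv(item) inside main would then produce a file that spreadsheet software can open directly.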
Scraping with a process pool:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
import time
import json
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_one_page(url):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)"
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# parse_one_page parses the html; re.S makes '.' also match newlines,
# and each (.*?) group captures one field non-greedily
def parse_one_page(html):
    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?name"><a'
        + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    # html is the target string the pattern is matched against
    items = re.findall(pattern, html)
    for item in items:
        yield {
            "index": item[0],
            "image": item[1],
            "name": item[2],
            "actor": item[3].strip(),
            "time": item[4].strip(),
            "star": item[5] + item[6],
        }


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        # write_to_file(item)
        write_to_csv(item)


def write_to_csv(content):
    with open("猫眼进程result.csv", 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


# def write_to_file(content):
#     # 'a' opens the file in append mode
#     with open('猫眼result.txt', 'a', encoding='utf-8') as f:
#         f.write(json.dumps(content, ensure_ascii=False) + '\n')


if __name__ == "__main__":
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    # map blocks until every page is done; release the workers afterwards
    pool.close()
    pool.join()
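Because each worker process appends to 猫眼进程result.csv on its own, the rows will not come out in ranking order, and concurrent appends from several processes may interleave. One way around this, sketched below, is to let the workers only fetch and parse, and have the parent process do all of the writing; collect_one_page is a hypothetical helper name reusing the functions defined above:

def collect_one_page(offset):
    # worker: fetch and parse one page, but do not touch the output file
    html = get_one_page('http://maoyan.com/board/4?offset=' + str(offset))
    return list(parse_one_page(html)) if html else []


if __name__ == "__main__":
    pool = Pool()
    pages = pool.map(collect_one_page, [i * 10 for i in range(10)])
    pool.close()
    pool.join()
    # only the parent writes, so the file stays in page order
    for page in pages:
        for item in page:
            write_to_csv(item)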