Python爬取码市项目
上周用 Java 爬取了码市上的项目,这周改用 Python 试了一下,代码如下。
import requests
import brotli
from requests.exceptions import RequestException
import time
import csv
import json
# Request headers sent with every call; asks the server for a JSON response.
headers = {'Accept': 'application/json'}
def get_one_page(url, timeout=10):
    """Fetch one page of the project listing and return its decoded JSON.

    Args:
        url: Full URL of the API page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload when the server answers with HTTP 200 and a
        valid JSON body; ``None`` for any other status code or on failure.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return None
        return response.json()
    except (RequestException, ValueError):
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print("抓取失败")
        return None
def parse_one_page(d):
    """Yield one flat record per project found in a decoded API page.

    Args:
        d: Decoded JSON payload of one page. Anything that is not a dict
            (for example ``None`` after a failed fetch) yields nothing.

    Yields:
        A dict with the keys ``id``, ``name``, ``price``, ``statusText``,
        ``description``, ``duration`` and ``roles``. A field missing from a
        reward is reported as ``None`` so that one incomplete entry does not
        discard the rest of the page.
    """
    fields = ('id', 'name', 'price', 'statusText',
              'description', 'duration', 'roles')

    if not isinstance(d, dict):
        return

    rewards = d.get('rewards')
    if not isinstance(rewards, (list, tuple)):
        return

    for reward in rewards:
        # Skip malformed entries instead of aborting the whole page.
        if not isinstance(reward, dict):
            continue
        yield {field: reward.get(field) for field in fields}
def main():
    """Scrape the first ten listing pages and append the projects to Jobs.csv.

    The output file is opened once in append mode. The header row is written
    only when the file is still empty, so it is not repeated per page or on
    later runs. Pages that could not be fetched are skipped.
    """
    fieldnames = ['id', 'name', 'price', 'statusText',
                  'description', 'duration', 'roles']

    with open('Jobs.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        if f.tell() == 0:
            writer.writeheader()

        for page in range(1, 11):
            url = 'https://codemart.com/api/project?page={}'.format(page)
            d = get_one_page(url)
            if d is None:
                # Fetch failed; do not report the page as done.
                continue
            print('第{}页抓取完毕'.format(page))
            # Write each parsed record to the CSV file in order.
            for item in parse_one_page(d):
                writer.writerow(item)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
跑完之后会生成一个 CSV 文件。代码量比 Java 版本少了很多,运行时间则与 Java 版本差不多(可能是因为数据量不大,所以差别不太明显 ( ̄▽ ̄)")。