1 import requests
2 from requests.exceptions import RequestException
3 import re
4 import json
5
def get_one_page(url):
    """Fetch one listing page and return its decoded HTML, or None on failure.

    :param url: absolute URL of the page to download.
    :return: page body decoded as gb2312 on HTTP 200; None on any
        network error or non-200 status.
    """
    headers = {
        # Impersonate a desktop browser; sites often reject the default
        # python-requests User-Agent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/'
                      '535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'
    }
    try:
        # requests has NO default timeout -- without one a dead connection
        # hangs the crawler forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        # The site serves Simplified-Chinese pages; decode as gb2312
        # (only meaningful on a successful response).
        response.encoding = 'gb2312'
        return response.text
    return None
19
def parse_one_page(html):
    """Extract every entry from a listing page's HTML.

    :param html: raw HTML text of one listing page.
    :yields: one dict per <li> entry with keys 'image' (cover URL),
        'title', 'type', and 'introduction'.
    """
    # Raw strings for regex; re.S lets '.' span newlines because each
    # <li> block covers several source lines.
    pattern = re.compile(
        r'<li>.*?target.*?src="(.*?)".*?<h2>.*?title.*?>(.*?)</a>'
        r'</h2>.*?</span><span>(.*?)</span>.*?>(.*?)</p>.*?</li>', re.S)
    # Unpack each match tuple by name instead of indexing item[0..3].
    for image, title, kind, introduction in pattern.findall(html):
        yield {
            'image': image,
            'title': title,
            'type': kind,
            'introduction': introduction,
        }
33
def write_to_file(content, path='Yinghua.json'):
    """Append one record to the output file as a JSON line.

    :param content: any json-serializable object to persist.
    :param path: output file; defaults to 'Yinghua.json' in the current
        directory (backward-compatible with the original hard-coded name).
    """
    with open(path, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese text readable in the file.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
37
def main(page):
    """Crawl one listing page, print and persist every entry on it.

    :param page: 1-based listing-page number to crawl.
    """
    # Each listing page shows 12 entries; compute the global ordinal of
    # the first entry on this page.
    num = (page - 1) * 12 + 1
    url = '×××page=' + str(page) + '×××'
    html = get_one_page(url)
    if html is None:
        # Download failed (network error or non-200) -- skip the page
        # instead of crashing inside the parser with a TypeError.
        return
    for item in parse_one_page(html):
        print(num)
        print(str(item) + '\n')
        # Store the ordinal inside the record so each output line stays
        # valid JSON; the original str(num) + str(item) concatenation
        # produced unparseable text in the .json file.
        record = dict(item)
        record['index'] = num
        write_to_file(record)
        num += 1
49
if __name__ == '__main__':
    # Crawl listing pages 1 through 397 inclusive.
    for page_number in range(1, 398):
        main(page=page_number)