1. Python 菜鸟教程 100 道例题。
代码(1):
#!/usr/bin/python
# coding:utf-8
import json
import re
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
def get_one_page(url):
    """Fetch *url* and return its body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The response text when the server answers 200, otherwise ``None``
        (non-200 status, timeout, or any other ``requests`` error).
    """
    try:
        # Without a timeout a single stalled connection blocks the crawl forever.
        response = requests.get(url, timeout=10)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    response.encoding = 'utf-8'
    return response.text
def parse_one_page(html):
    """Collect the text fragments of one exercise page into a flat list.

    Three sources are appended, in this order:
      1. both captured groups of every ``<p><strong>..</strong>..</p>`` match,
      2. the plain text of every ``<div class="hl-main">`` block,
      3. the text of the first ``<pre>`` element, if there is one.

    Args:
        html: Page markup as ``str``; ``None`` or ``''`` (a failed download)
            is accepted.

    Returns:
        A list of strings; empty when *html* is empty or ``None``.
    """
    # get_one_page() reports failure as None; re.findall would raise TypeError.
    if not html:
        return []

    datalist = []
    for groups in re.findall(r'<p><strong>(.*?)</strong>(.*?)</p>', html, re.S):
        datalist.extend(groups)

    for block in re.findall(r'<div class="hl-main">(.*?)</div>', html, re.S):
        # Strip the highlighting markup and keep only the text.
        datalist.append(BeautifulSoup(block, 'lxml').get_text())

    soup = BeautifulSoup(html, 'lxml')
    for pre in soup.select('pre')[:1]:
        datalist.append(pre.get_text())
    return datalist
def write_to_file(content):
    """Append each string of *content* to ``result.txt``, one per line.

    Args:
        content: Iterable of strings to write.
    """
    with open('result.txt', 'a', encoding='utf-8') as out:
        for entry in content:
            out.write(entry + '\n')
def main(offest):
    """Download exercise page number *offest* and append its text to the output file.

    Args:
        offest: Exercise number spliced into the page URL (the misspelled
            name is kept because the caller passes it by keyword).
    """
    url = 'http://www.runoob.com/python/python-exercise-example' + str(offest) + '.html'
    html = get_one_page(url)
    if html is None:
        # A failed download must not abort the whole run; report and skip it.
        print("请求页面失败", url)
        return
    write_to_file(parse_one_page(html))
if __name__ == "__main__":
    # Walk exercise numbers 1 through 100; the progress line is printed
    # after each page has been processed.
    for number in range(1, 101):
        main(offest=number)
        print("正在下载第{}道题。。。。。。".format(number))
代码(2):
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from lxml import etree
def geu_page(url):
    """Download *url* and return its body as UTF-8 encoded bytes.

    The request is attempted once and retried up to three more times when
    the request itself fails (connection error, timeout, ...). Every failure
    is printed together with the URL.

    Args:
        url: Address of the page to download.

    Returns:
        The page as ``bytes`` on HTTP 200; ``None`` when the server answers
        with any other status or when every attempt fails.
    """
    attempts = 4  # the initial try plus three retries
    for _ in range(attempts):
        try:
            res = requests.get(url, timeout=4)
        except requests.exceptions.RequestException as e:
            # Each attempt is guarded, so a failing retry cannot escape as an
            # unhandled exception.
            print(url, e)
            continue
        if res.status_code != 200:
            return None
        res.encoding = 'utf-8'
        return res.text.encode('utf-8')
    return None
def get_index(url):
    """Yield the absolute URL of every link in the third ``<ul>`` of the index page.

    Args:
        url: Address of the index page.

    Yields:
        ``'http://www.runoob.com'`` joined with each anchor's ``href``.
        Nothing is yielded when the index page could not be downloaded.
    """
    html = geu_page(url)
    if html is None:
        # geu_page() returns None on failure; BeautifulSoup(None) would raise.
        return
    soup = BeautifulSoup(html, 'lxml')
    # The third <ul> is the one read here — presumably the exercise list;
    # verify against the live page layout.
    link_list = soup.find_all('ul')[2]
    for anchor in link_list.find_all('a'):
        href = anchor.get('href')
        if href:  # an <a> without href would make the concatenation fail
            yield 'http://www.runoob.com' + href
def get_data(url):
    """Download one exercise page and append its text to the output file.

    Written, in order: the ``#content h1`` heading, the text of the 2nd and
    3rd ``#content p`` elements, the leading text of the 4th one, and the
    concatenated text of the ``hl-main`` code spans.

    Args:
        url: Address of the exercise page.

    Raises:
        IndexError: If the page has fewer than four ``#content p`` elements.
    """
    html = geu_page(url)
    if html is None:
        # Nothing was downloaded; skip this exercise instead of crashing.
        return
    doc = pq(html)
    tree = etree.HTML(html)
    title = doc('#content h1').text()
    print('正在下载'+":"+title)
    paragraphs = doc('#content p')
    name = pq(paragraphs[1]).text()
    num = pq(paragraphs[2]).text()
    # An element's .text is None when it has no leading text node, which
    # would make the string concatenation below raise TypeError.
    n = paragraphs[3].text or ''
    code = ''.join(tree.xpath('//div[@class="hl-main"]/span/text()'))
    with open(r'pythpn习题100例.txt','a+',encoding='utf-8') as f:
        f.write(title+'\n')
        f.write(name+'\n')
        f.write(num+'\n')
        f.write(n+'\n')
        f.write(code)
        f.write('\r\n')
def main():
    """Crawl the exercise index and save every exercise it links to."""
    index_url = r'http://www.runoob.com/python/python-100-examples.html'
    for exercise_url in get_index(index_url):
        get_data(exercise_url)
if __name__ == "__main__":
    # Start the crawl only when this file is run as a script.
    main()
2. 猫眼电影 Top 100。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import json
from multiprocessing import Pool
import requests
from requests.exceptions import RequestException
from multiprocessing import Pool
import re
def get_one_page(url):
    """Fetch *url* with a browser User-Agent and return the response text.

    Args:
        url: Address of the page to download.

    Returns:
        The response text on HTTP 200, otherwise ``None``. Request errors
        (including timeouts) are printed and also yield ``None``.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking a pool worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException as e:
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Yield one dict per ``<dd>`` movie entry found in *html*.

    Args:
        html: Board page markup; ``None`` or ``''`` yields nothing.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; all values are strings.
    """
    # get_one_page() reports failure as None; re.findall would raise TypeError.
    if not html:
        return
    # Raw strings keep ``\d`` a regex escape rather than an invalid string escape.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            # Drops the first 3 / 5 characters — presumably the fixed label
            # prefixes in front of the cast and the release date.
            'actor': star.strip()[3:],
            'time': release.strip()[5:],
            # The score is split into integer and fractional parts in the markup.
            'score': integer + fraction,
        }
def write_to_file(content):
    """Append *content* to ``result.txt`` as one line of JSON.

    Args:
        content: JSON-serialisable object; non-ASCII text is written as-is
            rather than as ``\\uXXXX`` escapes.
    """
    # The with-block closes the file, so no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Download one board page and print and store every movie found on it.

    Args:
        offset: Value of the ``offset`` query parameter selecting the page.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # The download failed; the parser cannot work on None.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Offsets 0, 10, ..., 90 are handed to the worker processes. A sequential
    # run would simply call main(offset) for each value in turn.
    offsets = [i * 10 for i in range(10)]
    # Using the pool as a context manager releases its worker processes.
    with Pool() as pool:
        pool.map(main, offsets)
3.今日头条图集抓取。
代码:
#!/usr/bin/python
#coding:utf-8
import os
import json
import re
import pymongo
import requests
from config import *
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from urllib.parse import urlencode
from hashlib import md5
from multiprocessing import Pool
# Module-level MongoDB handles used by save_to_mongo().
# connect=False is passed to MongoClient — presumably to defer connecting
# until first use, since main() runs in multiprocessing workers; confirm
# against the PyMongo documentation.
client = pymongo.MongoClient(MONGO_URL,connect=False)
# Database selected by the MONGO_DB name from the config module.
db = client[MONGO_DB]
def get_page_index(offest, keyword):
    """Request one page of search results and return the raw response text.

    Args:
        offest: Value sent as the ``offset`` query parameter (the misspelled
            parameter name is kept for compatibility).
        keyword: Search keyword.

    Returns:
        The response text on HTTP 200, otherwise ``None``. Request errors
        (including timeouts) print a message and also yield ``None``.
    """
    params = {
        'offset': offest,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    # NOTE(review): the cookie below is a hard-coded captured browser session.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'tt_webid=6607376733821126151; tt_webid=6607376733821126151; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6607376733821126151; UM_distinctid=1662fc5d1b478-04aa6684af3777-8383268-1fa400-1662fc5d1b556c; csrftoken=0e079708e36d9c1eeea96125f6b6309a; uuid="w:17e8c76a5628443999604cfc1482b920"; ccid=fba911a3338ceafebd52015ebe3fb4a9; CNZZDATA1259612802=1051770912-1538395942-https%253A%252F%252Fwww.google.com.hk%252F%7C1538488870; __tasessionId=g87q247qw1538490746687',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking a pool worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print("请求索引页出现错误")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in a search-result JSON document.

    Args:
        html: JSON text returned by the search endpoint; ``None`` or ``''``
            (a failed download) yields nothing.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data`` list.

    Raises:
        json.JSONDecodeError: If *html* is non-empty but not valid JSON.
    """
    if not html:
        return
    data = json.loads(html)
    if not isinstance(data, dict):
        return
    # ``data`` may be present but null, so fall back to an empty list.
    for item in data.get('data') or []:
        article_url = item.get('article_url')
        if article_url:  # entries without a URL cannot be fetched
            yield article_url
def get_page_detail(url):
    """Download an article page and return its markup.

    Args:
        url: Address of the article page.

    Returns:
        The response text on HTTP 200, otherwise ``None``. Request errors
        (including timeouts) print a message with the URL and yield ``None``.
    """
    # NOTE(review): the cookie below is a hard-coded captured browser session.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'tt_webid=6607376733821126151; tt_webid=6607376733821126151; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6607376733821126151; UM_distinctid=1662fc5d1b478-04aa6684af3777-8383268-1fa400-1662fc5d1b556c; csrftoken=0e079708e36d9c1eeea96125f6b6309a; uuid="w:17e8c76a5628443999604cfc1482b920"; ccid=fba911a3338ceafebd52015ebe3fb4a9; CNZZDATA1259612802=1051770912-1538395942-https%253A%252F%252Fwww.google.com.hk%252F%7C1538488870; __tasessionId=g87q247qw1538490746687',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking a pool worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print("请求详情页出现错误",url)
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_datail(html, url):
    """Extract the gallery of an article page and download its images.

    Args:
        html: Markup of the article page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page embeds
        a ``gallery: JSON.parse("...")`` payload containing ``sub_images``;
        otherwise ``None``.

    Raises:
        json.JSONDecodeError: If the embedded payload is not valid JSON after
            the backslashes are removed.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without <title> gets an empty title instead of raising IndexError.
    title = title_tags[0].get_text() if title_tags else ''
    print(title)

    # Raw string: ``\(`` and ``\)`` are regex escapes, not string escapes.
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None

    # The payload is a JS string literal; every backslash is dropped before
    # parsing. NOTE(review): this also removes backslashes that belong to
    # the data itself — confirm that is acceptable.
    data = json.loads(result.group(1).replace('\\', ''))
    if not (isinstance(data, dict) and 'sub_images' in data):
        return None

    images = [item.get('url') for item in data.get('sub_images')]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images
    }
def download_image(url):
    """Download the image at *url* and hand its bytes to ``save_image``.

    Nothing is saved on a non-200 response. Request errors (including
    timeouts) print a message with the URL.

    Args:
        url: Address of the image.

    Returns:
        Always ``None``.
    """
    print('正在下载',url)
    # NOTE(review): the cookie below is a hard-coded captured browser session.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'cookie': 'tt_webid=6607376733821126151; tt_webid=6607376733821126151; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6607376733821126151; UM_distinctid=1662fc5d1b478-04aa6684af3777-8383268-1fa400-1662fc5d1b556c; csrftoken=0e079708e36d9c1eeea96125f6b6309a; uuid="w:17e8c76a5628443999604cfc1482b920"; ccid=fba911a3338ceafebd52015ebe3fb4a9; CNZZDATA1259612802=1051770912-1538395942-https%253A%252F%252Fwww.google.com.hk%252F%7C1538488870; __tasessionId=g87q247qw1538490746687',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled image host from blocking a pool worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        print("请求图片出错",url)
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None
def save_image(content):
    """Write *content* to ``<cwd>/<md5 of content>.jpg`` unless that file exists.

    Naming the file after the MD5 digest of its bytes means identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    file_name = md5(content).hexdigest() + '.jpg'
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The with-block closes the file, so no explicit close() is needed.
        with open(file_path, 'wb') as f:
            f.write(content)
def save_to_mongo(result):
    """Store *result* as one document in the configured MongoDB collection.

    Args:
        result: Dict describing one gallery (as built by the page parser).

    Returns:
        ``True`` when the server acknowledged the insert, otherwise ``False``.
    """
    # The collection is named by MONGO_TABLE (the database by MONGO_DB).
    # insert_one() replaces Collection.insert(), which PyMongo 4 removed.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.acknowledged:
        print('存储成功', result)
        return True
    return False
def main(offset):
    """Process one page of search results: fetch, parse, download and store.

    Args:
        offset: Result offset passed to the search request.
    """
    index_html = get_page_index(offset, KEY)
    if index_html is None:
        # The index request failed; there is nothing to parse.
        return
    for url in parse_page_index(index_html):
        if not url:
            # Entries without an article URL cannot be fetched.
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_datail(detail_html, url)
        if result:
            save_to_mongo(result)
if __name__ == '__main__':
    # One offset per result group: group index times 20.
    groups = [x * 20 for x in range(Group_start, Group_end + 1)]
    # Using the pool as a context manager releases its worker processes.
    with Pool() as pool:
        pool.map(main, groups)
配置文件:
# Host passed to pymongo.MongoClient by the crawler.
MONGO_URL = '127.0.0.1'
# Name of the MongoDB database the crawler opens.
MONGO_DB = 'toutiao'
# Presumably the collection that results are stored in — verify against the crawler.
MONGO_TABLE = 'toutiao'
# First and last result-group index (inclusive); the crawler multiplies each
# index by 20 to get a search offset.
Group_start = 1
Group_end = 20
# Search keyword sent with every index request.
KEY = '詹姆斯'