.*?(.*?)'
+ '.*?class="IcoList">(.*?).*?class="IcoTime">(.*?)', re.S)
items = re.findall(pattern, html) # 利用re库的compile方法构造正则表达式,findall方法获取items
for item in items:
yield {
'名称': item[2],
'壁纸': item[1],
'网址': item[0],
'发布日期': item[4],
'查看次数': item[3][3:]
} # yield生成器,被调用时才赋值
content = get_image_content(item[1])
download_image(content)
def save_to_file(filename, file_type, text):
    """Append ``text`` as a single JSON line to a local file.

    The target path is the plain concatenation of the module-level
    ``path_txt`` prefix, ``filename`` and ``file_type``.

    Args:
        filename: Name placed directly after ``path_txt`` in the path.
        file_type: Suffix placed directly after ``filename`` in the path.
        text: JSON-serialisable object to write.

    Raises:
        TypeError: If ``text`` cannot be serialised by ``json.dumps``.
        OSError: If the file cannot be opened or written.
    """
    # Serialise first so that a non-serialisable record does not create or
    # touch the output file. ensure_ascii=False keeps non-ASCII text readable.
    line = json.dumps(text, ensure_ascii=False) + '\n'
    target = '{}{}{}'.format(path_txt, filename, file_type)
    # Append mode, UTF-8 encoded; the ``with`` statement closes the file,
    # so no explicit close() call is needed.
    with open(target, 'a', encoding='utf-8') as wf:
        wf.write(line)
    print(text, '写入到本地成功!')
def save_to_mongo(text):
    """Insert ``text`` into the MongoDB collection ``db[mongo_table]``.

    Uses ``insert_one`` because ``Collection.insert`` was deprecated in
    PyMongo 3.0 and removed in PyMongo 4.0.

    Args:
        text: The document to store (a single mapping).

    Returns:
        True if the driver reports an inserted id, otherwise False.
    """
    result = db[mongo_table].insert_one(text)
    if result.inserted_id is not None:
        print(text, '写入Mongo成功!')
        return True
    return False
def main(filename, page):
    """Fetch one listing page and persist every item extracted from it.

    Args:
        filename: Passed through to ``save_to_file`` as the output file name.
        page: Value substituted into the listing-page URL.
    """
    page_url = 'http://www.***/bizhitupian/meinvbizhi/{}.htm'.format(page)
    page_html = get_responses(page_url)
    # Each extracted record is written to the local file first, then to MongoDB.
    for record in get_url_items(page_html):
        save_to_file(filename, file_type, record)
        save_to_mongo(record)
if __name__ == '__main__':
    # Process every page from start_page to end_page inclusive.
    stop = end_page + 1
    for page_number in range(start_page, stop):
        main(file, page_number)
        # Wait 15 seconds after each page to reduce the chance of being detected.
        time.sleep(15)