Today I needed to crawl some bilingual material on short notice (still uncleaned) and wanted to make full use of it.
The code below collects the link to every bilingual news article on the Chinadaily site. The first step is to study the URLs and page structure: paging through the listing generally takes the front-page URL and adds _2, _3, and so on. So this first script only gathers the links (both scripts here are Python 2; urllib.urlopen no longer exists in Python 3).
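As a quick sketch of that pattern (my own helper, not part of the scripts below), the listing URL for page n can be built like this:

def listing_url(n):
    # page 1 is news_bilingual.html; later pages insert _n before .html
    base = "http://language.chinadaily.com.cn/news_bilingual"
    return base + ".html" if n == 1 else base + "_%d.html" % n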
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File: bi_news.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""
import urllib
import re
import os

bi_urls = []

def getHtml(url):  # fetch a page and return it as a list of lines
    page = urllib.urlopen(url)
    html = page.readlines()
    return html

def getImg(html):  # unused here: downloads the .jpg images found in a page
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1

def geturl(html):  # pull the article links we want out of a listing page
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm"', line):
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm"', line):
                # the listing runs newest-first, so the first 2016 link means
                # we are done: we only want corpus from after 2016
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print("http://language.chinadaily.com.cn/" + url[0])
                bi_urls.append("http://language.chinadaily.com.cn/" + url[0])

if __name__ == '__main__':
    # os.system('wget -r --spider http://language.chinadaily.com.cn/news_bilingual.html')
    n = 1
    while n:  # walk the listing pages until geturl() hits 2016 and exits
        if n < 2:
            html = getHtml("http://language.chinadaily.com.cn/news_bilingual.html")
        else:
            html = getHtml("http://language.chinadaily.com.cn/news_bilingual_" + str(n) + ".html")
        geturl(html)
        n = n + 1
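To sanity-check the link regex before a real crawl, it helps to run it on a made-up listing line (the href value below is hypothetical, just shaped like the ones the page serves):

import re

sample = '<div class="mr10"><a href="2017-12/01/content_35178945.htm" target="_blank">'  # hypothetical line
print(re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', sample))
# -> ['2017-12/01/content_35178945.htm']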
Run python bi_news.py > url.txt to save the links we want.
url.txt contents: one link per line, of the form http://language.chinadaily.com.cn/YYYY-MM/DD/content_NNNNNNNN.htm.
The next step is a simple crawl of the page content behind each link in url.txt, filing the news into folders by month, with the trailing eight digits of each news link as the filename.
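For a link such as http://language.chinadaily.com.cn/2017-12/01/content_35178945.htm (a made-up example), the two regexes used below give the month folder and the filename:

import re

url = "http://language.chinadaily.com.cn/2017-12/01/content_35178945.htm"  # hypothetical
print(re.findall(r'\d\d\d\d-\d\d', url)[0])  # 2017-12  -> month folder
print(re.findall(r'\d{6,}', url)[0])         # 35178945 -> filename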
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
File: content.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""
import urllib
import re
import os
import sys

def getHtml(url):  # fetch a page and return it as one string
    page = urllib.urlopen(url)
    html = page.read()
    return html
def savefile(savepath, content):  # write the collected raw pages to disk
    with open(savepath, "w") as fp:
        fp.write(content)

if __name__ == '__main__':
    for line in open(sys.argv[1], 'r'):
        url = line.strip()  # drop the trailing newline before building request URLs
        content = ""
        n = 1
        while n:  # follow the _2, _3, ... continuations so multi-page news is not missed
            if n > 1:
                # page n of an article inserts _n before the .htm suffix,
                # the same convention as the listing pages above
                htm = re.sub(r'\.htm$', '_' + str(n) + '.htm', url)
            else:
                htm = url
            raw = getHtml(htm)
            # a real article page carries the XHTML 1.0 doctype; a page without
            # it is blank/error output, meaning we have run out of pages
            if not re.findall(r'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">', raw):
                break
            print(htm)
            n = n + 1
            # for hang in raw:
            #     if re.search(r'^<p>.*</p>', hang):
            content = content + raw
        date = re.findall(r'\d\d\d\d-\d\d', url)[0]  # month part of the link, e.g. 2017-12
        filename = re.findall(r'\d{6,}', url)[0]     # trailing digits of the link
        if not os.path.exists(date):  # create the month folder on first use
            os.makedirs(date)
        savefile(date + "/" + filename + ".txt", content)
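Run python content.py url.txt to fetch everything: each article, with its _2, _3, ... continuation pages concatenated, is saved as e.g. 2017-12/35178945.txt (a made-up example).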