# Python爬虫简单模版 (a simple Python web-scraper template)
#导入模块
import requests
from lxml import etree
import json
def getOnePage(url, timeout=10):
    """Download one page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (for example on a timeout or connection error).
    """
    # Browser-like User-Agent header sent with the request.
    header = {"User-Agent":"Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/70.0.3538.110 Safari/537.36"}
    response = requests.get(url, headers=header, timeout=timeout)
    return response.text
def parseOnPage(text):
    """Yield one dict per movie extracted from the HTML of a board page.

    Args:
        text: HTML source of the page.

    Yields:
        A dict with the keys ``index`` (0-based position on the page),
        ``name``, ``star`` (with surrounding whitespace stripped) and
        ``releasetime``.
    """
    tree = etree.HTML(text)
    # Movie titles (this XPath is not the only possible one).
    titles = tree.xpath("//p[@class='name']/a/text()")
    # Leading actors.
    stars = tree.xpath("//p[@class='star']/text()")
    # Release dates.
    release_times = tree.xpath("//p[@class='releasetime']/text()")
    for position, title in enumerate(titles):
        record = {
            "index": position,
            "name": title,
            "star": stars[position].strip(),
            "releasetime": release_times[position],
        }
        yield record
def wirte2File(content, path=r"C:\Users\jiangxian\Desktop\爬虫\maoyan666.txt"):
    """Append ``content`` to a text file as one JSON-encoded line.

    Args:
        content: JSON-serialisable object to store.
        path: Output file. The default is the original author's location;
            pass your own path when running elsewhere.

    Raises:
        OSError: If the file cannot be opened for appending.
        TypeError: If ``content`` is not JSON-serialisable.
    """
    # ensure_ascii=False keeps non-ASCII text readable in the output file
    # instead of \uXXXX escapes; the file is written as UTF-8.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as fp:
        fp.write(line)
def main():
    """Scrape ten board pages, saving and printing every movie record.

    For each page the HTML is fetched with ``getOnePage``, parsed with
    ``parseOnPage``, and every resulting record is appended to the output
    file via ``wirte2File`` and echoed to stdout.
    """
    # The offset query parameter advances in steps of 10: 0, 10, ..., 90.
    for offset in range(10):
        url = "https://maoyan.com/board/4?offset={}".format(offset * 10)
        text = getOnePage(url)
        for item in parseOnPage(text):
            wirte2File(item)
            print(item)
# Run the scraper; this call is unconditional, so it also executes on import.
main()