#导入模块
import requests
from lxml import etree
import json
def getOnePage(url):
    """Fetch one page and return its HTML body as text.

    Args:
        url: the page URL to download.

    Returns:
        The response body decoded as text by requests.
    """
    # Desktop-browser User-Agent so the site serves the normal page.
    header = {"User-Agent": "MMozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400"}
    # timeout keeps the crawler from hanging forever on a stalled connection
    html = requests.get(url, headers=header, timeout=10)
    return html.text
def parseOnPage(text):
    """Parse one Maoyan board page and yield a dict per movie.

    Args:
        text: raw HTML of the board page.

    Yields:
        dicts with keys "index", "name", "star", "releasetime".
    """
    html = etree.HTML(text)
    # movie titles
    names = html.xpath("//p[@class='name']/a/text()")
    # leading actors
    stars = html.xpath("//p[@class='star']/text()")
    # release dates
    dates = html.xpath("//p[@class='releasetime']/text()")
    # zip() stops at the shortest list, so a page with a missing field no
    # longer raises IndexError as the old range(len(...)) indexed loop could.
    for index, (name, star, releasetime) in enumerate(zip(names, stars, dates)):
        yield {
            "index": index,
            "name": name,
            "star": star.strip(),
            "releasetime": releasetime,
        }
def wirte2File(content, path=r"C:\Users\john\Desktop\爬虫.txt"):
    """Append *content* to a text file as one JSON line.

    Args:
        content: JSON-serializable object (one movie record).
        path: output file; defaults to the original hard-coded location
            for backward compatibility — pass your own path to redirect.
    """
    # ensure_ascii=False keeps the Chinese titles human-readable in the file
    with open(path, 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Crawl all 10 pages of the Maoyan top-100 board and save each movie."""
    for offset in range(10):
        # the board paginates in steps of 10 via the offset query parameter
        url = "https://maoyan.com/board/4?offset={}".format(offset * 10)
        text = getOnePage(url)
        for item in parseOnPage(text):
            wirte2File(item)
            print(item)


# Guard the entry point so importing this module does not start the crawl.
if __name__ == "__main__":
    main()
# 单项爬虫测试 (single-request crawler test)
#导入模块
import requests
from lxml import etree
import json
# NOTE: the original used full-width “smart quotes”, which are a Python
# syntax error; they are replaced with plain ASCII quotes here.
url = "https://maoyan.com/cinemas"
header = {"User-Agent": "MMozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400"}
# timeout prevents an indefinite hang on a stalled connection
html = requests.get(url, headers=header, timeout=10)
text = html.text
html = etree.HTML(text)
# cinema names; this xpath is not unique on the page (原注: 并非唯一)
addr = html.xpath("//div[@class='cinema-info']/a/text()")
print(addr)
xpath 路径测试 (xpath path tests)
//*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[1]/a/span[1]
豆瓣 (Douban)
addr = html.xpath("//div[@class='hd']/a/span[1]/text()")#并非唯一
html.xpath("//div[@class='bd']/p/text()")
//*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[2]/p[1]
//*[@id="content"]/div/div[1]/ol/li[4]/div/div[2]/div[2]/div/span[2]
html.xpath("//li/div/div[2]/div[2]/div/span[2]/text()")
//*[@id="content"]/div/div[1]/ol/li[2]/div/div[1]/em
#导入模块
import requests
from lxml import etree
import json
def getOnePage(url):
    """Download the page at *url* and return its text body."""
    # browser-like User-Agent so the site serves the normal page
    headers = {
        "User-Agent": "MMozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
    }
    response = requests.get(url, headers=headers)
    return response.text
def parseOnPage(text):
    """Walk the weapon names found in *text* and yield {"index", "name"} records."""
    tree = etree.HTML(text)
    # weapon names; this class selector is not unique on the page (原注: 并非唯一)
    titles = tree.xpath("//div[@class='exotic item-name']/a/text()")
    for position, title in enumerate(titles):
        yield {"index": position, "name": title}
def wirte2File(content, path=r"D:\爬虫.txt"):
    """Append *content* to *path* as one JSON-encoded line.

    Args:
        content: JSON-serializable record.
        path: output file; defaults to the original hard-coded location
            for backward compatibility — pass your own path to redirect.
    """
    # ensure_ascii=False keeps non-ASCII names readable in the output file
    with open(path, 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Crawl the first 10 pages of the light.gg weapon list and save each name."""
    for offset in range(10):
        # this site paginates via a page number in the query string
        url = "https://www.light.gg/db/category/1/weapons/?page={}".format(offset)
        text = getOnePage(url)
        for item in parseOnPage(text):
            wirte2File(item)
            print(item)


# Guard so importing this module does not kick off the crawl.
if __name__ == "__main__":
    main()
import requests
from lxml import etree
import json
def getOnePage(url):
    """Fetch one page and return its HTML body as text.

    Args:
        url: the page URL to download.

    Returns:
        The response body decoded as text by requests.
    """
    # browser-like User-Agent so the site serves the normal page
    header = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}
    # timeout keeps the crawler from hanging forever on a stalled connection
    html = requests.get(url, headers=header, timeout=10)
    return html.text
def parseOnPage(text):
    """Yield one record per movie found in a Maoyan board page."""
    tree = etree.HTML(text)
    names = tree.xpath("//p[@class='name']/a/text()")
    stars = tree.xpath("//p[@class='star']/text()")
    dates = tree.xpath("//p[@class='releasetime']/text()")
    for i, movie in enumerate(names):
        record = {
            "index": i,
            "name": movie,
            "star": stars[i].strip(),
            "releasetime": dates[i],
        }
        yield record
def write2File(content, path=r"C:\Users\john\Desktop\爬虫.txt"):
    """Append *content* to *path* as one JSON-encoded line.

    Args:
        content: JSON-serializable record.
        path: output file; defaults to the original hard-coded location
            for backward compatibility — pass your own path to redirect.
    """
    # ensure_ascii=False keeps the Chinese titles human-readable in the file
    with open(path, 'a', encoding='utf-8') as fp:
        fp.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
    """Crawl all 10 pages of the Maoyan top-100 board and save each movie."""
    for offset in range(10):
        # the board paginates in steps of 10 via the offset query parameter
        url = "https://maoyan.com/board/4?offset={}".format(offset * 10)
        text = getOnePage(url)
        for item in parseOnPage(text):
            write2File(item)
            print(item)


# Guard the entry point so importing this module does not start the crawl.
if __name__ == "__main__":
    main()