这段代码处理排版比较乱的文本文件,但是我一直整理文件不成功,忘记从哪个大牛的网站上摘来的,抱歉!



module Main where

import Data.Char
import Data.List
import System.Environment
import System.FilePath
import Text.Parsec
import Text.Parsec.String

paraBeginSpace :: Int
paraBeginSpace = 2

main :: IO ()
main = do
  args <- getArgs
  let input = head args
      output = combine (takeDirectory input) "formatted.txt"
  str <- readFile input
  case (runParser file () "" str) of
    Left err -> do
         putStr "parse error at "
         print err
    Right x  -> writeFile output x
  return ()


emptyLine :: Parser String
emptyLine = do
  many (satisfy isSepSpace)
  newline
  return ""

pageLine :: Parser String
pageLine = do
  many1 (satisfy isSepSpace)
  many1 digit
  many (satisfy isSepSpace)
  newline
  return ""

isSepSpace :: Char -> Bool
isSepSpace s = isSpace s && s /= '\n'


insertSpaceBetweenEnglishWords :: [String] -> [String]
insertSpaceBetweenEnglishWords [] = []
insertSpaceBetweenEnglishWords (x:[]) = [x]
insertSpaceBetweenEnglishWords (x:y:xs)
    | isEnWord x,
      isEnWord y  = x : " " : insertSpaceBetweenEnglishWords (y:xs)
    | otherwise   = x : insertSpaceBetweenEnglishWords (y:xs)
    where
      isEnWord :: String -> Bool
      isEnWord w = and $ map (\l -> isAscii l && isLetter l) w

run :: Show a => Parser a -> String -> IO ()
run p input
        = case (parse p "" input) of
            Left err -> do{ putStr "parse error at "
                          ; print err
                          }
            Right x  -> print x

word :: Parser String
word = do
  content <- many1 (satisfy (not.isSpace ) )
  return content

beginLine :: Parser String
beginLine = do
  begin <- try (count paraBeginSpace space) <?> "段首2个空格"
  beginWord <- word <?> "段首2个空格"
  many $ satisfy isSepSpace
  lineContents <- word `sepBy` satisfy isSepSpace
  newline
  return $ concat $ insertSpaceBetweenEnglishWords
             $ begin : beginWord : lineContents

contentLine :: Parser String
contentLine = do
  lineContents <- word `sepEndBy1` many (satisfy isSepSpace)
  newline
  return $ concat $ insertSpaceBetweenEnglishWords lineContents



brokenLine :: Parser String
brokenLine = do
  notParaBegin -- 1 or 3+ spaces
  lineContents <- word `sepEndBy1`
                  many (satisfy isSepSpace)
                  <?> "非段落起首空白数"
  newline
  many (choice [try emptyLine, try pageLine])
  lookAhead contentLine
  return $ concat
             $ insertSpaceBetweenEnglishWords lineContents

notParaBegin :: Parser String
notParaBegin = try (do
                     space
                     space
                     many1 space
                     return ""
                   ) -- >= 3 spaces
               <|> (space >> return "") -- 1 space


paragraph :: Parser String
paragraph = do
   beg  <- beginLine
   cons <- many (choice [try emptyLine
                        ,try pageLine
                        ,try contentLine
                        ,try brokenLine
                        ,try optString])
   lookAhead beginLine
            <|> lookAhead titleLine
            <|> lookAhead (eof >> return "")
   return $ concat $ beg : cons
    where
      optString :: Parser String
      optString  = do
              sp <- try (do
                          space
                          space
                          many1 space)
                    <|> (space >> return " ")
              lc <- anyChar `manyTill` (try newline)
              many (choice [try emptyLine, try pageLine])
              lookAhead notParaBegin
                   <|> (eof >> return "")
              return $ "\n" ++ sp ++ lc


titleLine :: Parser String
titleLine = do
  notParaBegin -- 1 or 3+ spaces
  lineContents <- word `sepEndBy1`
                  many (satisfy isSepSpace)
                  <?> "非段落起首空白数"
  newline
  many (choice [try emptyLine, try pageLine])
  lookAhead beginLine
  return $ concat
            $ insertSpaceBetweenEnglishWords lineContents


article :: Parser String
article = do
  t <- titleLine
  ps <- many1 paragraph
  many (choice [try emptyLine, try pageLine])

  lookAhead titleLine
          <|> (eof >> return "")
  return $ intercalate "\n"( (t++"\n"):ps)


file :: Parser String
file = do
  ts <- many1 contentLine
  many (choice [try emptyLine, try pageLine])
  as <- many1 article
  lookAhead eof
  return $ "      " ++ concat ts ++ "\n\n"
         ++ intercalate "\n\n\n" as

fileTitle :: Parser String
fileTitle = do
  ts <- anyChar `manyTill` (try titleLine)
  return ts

-- simple test func
testParser :: Parser String -> String -> IO ()
testParser p input =
    case (runParser p () "" input) of
      Left err -> do
        putStr "parse error at "
        print err
      Right x  -> putStrLn x



  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值