irc bot for announcing rss/atom feeds (http://hackage.haskell.org/package/rss2irc)

root / Utils.hs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
{-# LANGUAGE PatternGuards, BangPatterns, DeriveDataTypeable, OverloadedStrings, ScopedTypeVariables #-}
{- |

Common utilities.

Copyright (c) Don Stewart 2008-2009, Simon Michael 2009-2014
License: BSD3.

-}

module Utils (
  module Utils,
  module Debug.Trace
  )
where
import Codec.Binary.UTF8.String as UTF8 (decodeString, encodeString, isUTF8Encoded)
import Control.Concurrent.MSampleVar
import Control.Monad
import Data.List
import Data.Maybe
import Data.Time.Clock (UTCTime,getCurrentTime)
import Data.Time.Format (parseTimeM)
import Data.Time.LocalTime (LocalTime,getCurrentTimeZone,utcToLocalTime)
import Prelude hiding (log)
import System.Info
import System.IO (stdout,hFlush)
import Data.Time.Format (defaultTimeLocale)
import Text.Feed.Query
import Text.Feed.Types (Item)
import Text.ParserCombinators.Parsec hiding (label)
import Text.Printf (printf)
import Text.RegexPR (splitRegexPR,gsubRegexPR)

import Base

import Debug.Trace
-- | trace a showable expression
strace :: Show a => a -> a
strace a = trace (show a) a
-- | labelled trace - like strace, with a label prepended
ltrace :: Show a => String -> a -> a
ltrace l a = trace (l ++ ": " ++ show a) a
-- | monadic trace - like strace, but works as a standalone line in a monad
mtrace :: (Monad m, Show a) => a -> m a
mtrace a = strace a `seq` return a
-- | trace an expression using a custom show function
tracewith :: (a -> String) -> a -> a
tracewith f e = trace (f e) e


-- Light abstraction layer for thread-safe mutable data

type Shared a = MSampleVar a

newSharedVar :: a -> IO (MSampleVar a)
newSharedVar = newSV

getSharedVar :: MSampleVar a -> IO a
getSharedVar v = do
  x <- readSV v
  writeSV v x
  return x

putSharedVar :: MSampleVar a -> a -> IO ()
putSharedVar v x = writeSV v x

-- Option parsing helpers

ircAddressFromOpts :: Opts -> Maybe IrcAddress
ircAddressFromOpts Opts{irc_address=""} = Nothing
ircAddressFromOpts Opts{irc_address=a} = Just $ parseIrcAddress a

parseIrcAddress :: String -> IrcAddress
parseIrcAddress a =
  either (\e -> opterror $ printf "could not parse IRC address \"%s\"\n%s\n" a (show e))
         id
         $ parse ircaddrp "" a
  where
    ircaddrp = choice' $ [
      do
        -- pre 0.5 syntax: [irc://]NICK@IRCSERVER[:PORT]/[#]CHANNEL
        optional $ choice' $ map string ["irc://", "irc:"]
        n <- many1 $ noneOf "@"
        char '@'
        s <- many1 $ noneOf ":/"
        p <- optionMaybe $ char ':' >> read `fmap` many1 digit >>= return
        char '/'
        optional $ char '#'
        c <- many1 $ noneOf "/ \t"
        eof
        return $ IrcAddress s p ('#':c) n
      ,
      do
        -- new easier syntax: [irc://]IRCSERVER[:PORT]/[#]CHANNEL/NICK
        optional $ choice' $ map string ["irc://", "irc:"]
        s <- many1 $ noneOf ":/"
        p <- optionMaybe $ char ':' >> read `fmap` many1 digit >>= return
        char '/'
        optional $ char '#'
        c <- many1 $ noneOf "/"
        char '/'
        n <- many1 $ noneOf "/ \t"
        eof
        return $ IrcAddress s p ('#':c) n
      ]

-- | A version of error' that suggests --help.
opterror :: String -> a
opterror = error' . (++ " (see --help for usage)")

-- | A version of error that's better at displaying unicode.
error' :: String -> a
error' = error . toPlatformString

-- | Convert a feed item to a string for the bot to announce on irc.
-- The announcement is likely but not guaranteed to fit within a
-- single irc message.
toAnnouncement:: Opts -> Item -> String
toAnnouncement opts i = applyReplacements opts $ printf "%s%s%s%s%s" title desc author' date link'
    where
      title = unlessopt no_title $ maybe "" (truncateWordsAt maxtitlelength "..." . clean) (getItemTitle i)
      desc = ifopt description $ maybe "" ((" - "++) . truncateWordsAt maxdesclength "..." . clean) (getItemDescription i)
      author' = ifopt author $ maybe "" ((" "++) . parenthesise . truncateWordsAt maxauthorlength "..." . clean) (getItemAuthor i)
      date = ifopt time $ maybe "" ((" "++) . truncateAt maxdatelength "..." . clean) (getItemDate i)
      link' = ifopt link_ $ maybe "" (("  "++) . truncateAt maxlinklength "..." . clean) (getItemLink i)

      clean = oneline . trimwhitespace . striphtml . stripemail
      ifopt o = if o opts then id else const ""
      unlessopt o = if not $ o opts then id else const ""
      oneline = intercalate "  " . map strip . lines -- two spaces to hint at newlines & brs
      trimwhitespace = gsubRegexPR "[ \t][ \t]+" " "
      striphtml = if html opts then id else stripHtml . brtonewline
      brtonewline = gsubRegexPR "(<|&lt;) *br */?(>|&gt;)" "\n"
      stripemail = if email opts then id else stripEmails
      parenthesise = (++")").("("++)

-- | Split an announcement into one or more suitably truncated and
-- formatted irc messages. Each call returns the next message and
-- the remainder of the announcement.
-- XXX n must be > length continuationsuffix
splitAnnouncement :: String -> (String,String)
splitAnnouncement a
    | length a <= maxmessagelength = (a,"")
    | otherwise =
        case splitAtWordBefore n a of
          (m,rest@(_:_)) -> (m++continuationsuffix, continuationprefix++rest)
          (m,"")         -> (m, "")
    where
      n = maxmessagelength - length continuationsuffix

continuationprefix, continuationsuffix :: String
continuationprefix = "... "
continuationsuffix = " ..."

-- | Truncate a string, if possible at a word boundary, at or before
-- the specified position, and indicate truncation with the specified
-- suffix. The length of the returned string will be in the range
-- n, n+length suffix.
truncateWordsAt :: Int -> String -> String -> String
truncateWordsAt n suffix s
    | s' == s   = s
    | otherwise = s' ++ suffix
    where
      s' = fst $ splitAtWordBefore n s

-- | Truncate a string at the specified position, and indicate
-- truncation with the specified suffix. The length of the returned
-- string will be in the range n, n+length suffix.
truncateAt :: Int -> String -> String -> String
truncateAt n suffix s
    | s' == s   = s
    | otherwise = s' ++ suffix
    where
      s' = take n s

-- | Split a string at or before the specified position, on a word boundary if possible.
splitAtWordBefore :: Int -> String -> (String,String)
splitAtWordBefore n s
    | null a || (null b) = (rstrip a, lstrip b)
    | last a == ' ' || (head b == ' ') || (not $ ' ' `elem` a) = (rstrip a, lstrip b)
    | otherwise = (rstrip $ take (length a - length partialword) a, partialword ++ lstrip b)
    where (a,b) = splitAt n s
          partialword = reverse $ takeWhile (/= ' ') $ reverse a


-- | Apply all --replace substitutions to a string, in turn.
-- Warning, will fail at runtime if there is a bad regexp.
applyReplacements :: Opts -> String -> String
applyReplacements opts = foldl' (.) id (reverse substitutions)
    where substitutions = map replaceOptToSubst $ replace opts 
          replaceOptToSubst s = case splitRegexPR "(?<!\\\\)/" s of
                     (pat:sub:[]) -> gsubRegexPR pat sub
                     _ -> id

-- | Replace any HTML tags or entities in a string with a single space.
stripHtml :: String -> String
stripHtml = gsubRegexPR "(&[^ \t]*?;|<.*?>)" " "

-- | Remove any email addresses from a string.
stripEmails :: String -> String
stripEmails = gsubRegexPR "(?i) ?(<|&lt;)?\\b[-._%+a-z0-9]+@[-.a-z0-9]+\\.[a-z]{2,4}\\b(>|&gt;)?" ""

maybeRead :: Read a => String -> Maybe a
maybeRead s = case reads s of
    [(x, _)] -> Just x
    _        -> Nothing

decrementMaybe :: Enum a => Maybe a -> Maybe a
decrementMaybe = maybe Nothing (Just . pred)

-- | Parse a datetime string if possible, trying at least the formats
-- likely to be used in RSS/Atom feeds.
parseDateTime :: String -> Maybe UTCTime
parseDateTime s = firstJust [parseTimeM True defaultTimeLocale f s' | f <- formats]
    where
      s' = adaptForParseTime s
      adaptForParseTime = gsubRegexPR "(....-..-..T..:..:..[\\+\\-]..):(..)" "\\1\\2" -- 2009-09-22T13:10:56+00:00
      formats = -- http://hackage.haskell.org/packages/archive/time/1.1.4/doc/html/Data-Time-Format.html#v%3AformatTime
          [
           "%a, %d %b %Y %T %z" -- Fri, 18 Sep 2009 12:42:07 -0400
          ,"%a, %d %b %Y %T %Z" -- Fri, 25 Sep 2009 11:01:23 UTC
          ,"%Y-%m-%dT%T%z"      -- 2009-09-22T13:10:56+0000
          ]

firstJust :: [Maybe a] -> Maybe a
firstJust ms = case dropWhile isNothing ms of (m:_) -> m
                                              _     -> Nothing

-- | Grammatically correct "every N minutes".
everyMinutesString :: Int -> String
everyMinutesString 1 = "every minute"
everyMinutesString i = "every " ++ show i ++ " minutes"

-- | Grammatically correct "in N minutes".
inMinutesString :: Int -> String
inMinutesString 1 = "in 1 minute"
inMinutesString i = "in " ++ show i ++ " minutes"

-- | Log some text to the console with a timestamp.
log :: String -> IO ()
log s = do
  t <- getTimeStamp
  putStrLn $ printf "%s: %s" t s
  hFlush stdout

-- | Decorate some multi-line text with a label and start/end separators.
labelledText :: String -> String -> String
labelledText label s = printf "========== %s:\n%s\n=============================================\n" label s

getCurrentLocalTime :: IO LocalTime
getCurrentLocalTime = do
  t <- getCurrentTime
  tz <- getCurrentTimeZone
  return $ utcToLocalTime tz t

getTimeStamp :: IO String
getTimeStamp = do
  t <- getCurrentLocalTime
  tz <- getCurrentTimeZone
  return $ printf "%s %s" (take 19 $ show t) (show tz)

hours, minutes, seconds :: Int
hours = 60 * minutes
minutes = 60 * seconds
seconds = 10^(6::Int)

strip, lstrip, rstrip, dropws :: String -> String
strip = lstrip . rstrip
lstrip = dropws
rstrip = reverse . dropws . reverse
dropws = dropWhile (`elem` (" \t"::String))

chomp :: String -> String
chomp = reverse . dropWhile (`elem` ("\n\r"::String)) . reverse

isLeft :: Either a b -> Bool
isLeft (Left _) = True
isLeft _        = False

-- platform strings

-- | A platform string is a string value from or for the operating system,
-- such as a file path or command-line argument (or environment variable's
-- name or value ?). On some platforms (such as unix) these are not real
-- unicode strings but have some encoding such as UTF-8. This alias does
-- no type enforcement but aids code clarity.
type PlatformString = String

-- | Convert a possibly encoded platform string to a real unicode string.
-- We decode the UTF-8 encoding recommended for unix systems
-- (cf http://www.dwheeler.com/essays/fixing-unix-linux-filenames.html)
-- and leave anything else unchanged.
fromPlatformString :: PlatformString -> String
fromPlatformString s = if UTF8.isUTF8Encoded s then UTF8.decodeString s else s

-- | Convert a unicode string to a possibly encoded platform string.
-- On unix we encode with the recommended UTF-8
-- (cf http://www.dwheeler.com/essays/fixing-unix-linux-filenames.html)
-- and elsewhere we leave it unchanged.
toPlatformString :: String -> PlatformString
toPlatformString = case os of
                     "unix" -> UTF8.encodeString
                     "linux" -> UTF8.encodeString
                     "darwin" -> UTF8.encodeString
                     _ -> id

-- | Backtracking choice, use this when alternatives share a prefix.
-- Consumes no input if all choices fail.
choice' :: [GenParser tok st a] -> GenParser tok st a
choice' = choice . map Text.ParserCombinators.Parsec.try