DZone Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world
Haskell Regular Expression Matcher
Basic implementation of Regular Expressions based on "Derivatives of Regular Expressions" by Janusz A. Brzozowski (Journal of Association for Computing Machinery, October 1964)
Not really intended for serious use. Just a proof of concept.
module Regexp
where
import Data.Set (Set)
import Data.Map (Map)
import Monad
import List
import Maybe
import qualified Data.Set as Set
import qualified Data.Map as Map
data Regexp =
Zero
| Match Char -- matches a single char
| Not Regexp -- matches the negation of its argument
| Prod [Regexp] -- matches a concatentation of its arguments
| Sum (Set Regexp) -- matches either of its arguments
| Star Regexp -- matches repetitions of its argument (including 0 repetitions)
deriving (Eq, Ord)
instance Show Regexp where
show Zero = "0"
show (Match c) = [c]
show (Not x) = '~' : show x
show (Prod x) = join . (map show) $ x
show (Sum x) = "(" ++ ( join . intersperse "|" . (map show) . Set.toList $ x ) ++ ")"
show (Star x) = "(" ++ show x ++ ")*"
-- Flagrant abuse of type classes to allow implicit conversion of datatypes into regular
-- expressions.
class Match a where
match :: a -> Regexp
instance Match Char where
match c = Match c
instance (Match a) => Match [a] where
match = con
instance Match Regexp where
match = id
-- "smart" versions of the constructors, which perform normalisation of the datatype.
-- As long as all regular expressions are built up using these and the match instance
-- for char we can guarantee that structural equality of terms == similarity.
-- This is important to make sure we only generate a finite number of states.
zero :: Regexp
zero = Zero
one :: Regexp
one = Prod []
(<+>) :: (Match a, Match b) => a -> b -> Regexp
x <+> y =
case (match x, match y) of
(Zero, b) -> b
(a, Zero) -> a
(Sum a, Sum b) -> Sum (Set.union a b)
(Sum a, b) -> Sum (Set.insert b a)
(a, Sum b) -> Sum (Set.insert a b)
(a, b) -> Sum $ Set.fromList [a, b]
oneOf :: (Match a) => [a] -> Regexp
oneOf = foldr (<+>) zero
(<*>) :: (Match a, Match b) => a -> b -> Regexp
u <*> v =
case (match u, match v) of
(Zero, _) -> zero
(_, Zero) -> zero
(Prod x, Prod y) -> Prod (x ++ y)
(Prod x, y) -> Prod (x ++ [y])
(x, Prod y) -> Prod (x : y)
(x, y) -> Prod [x, y]
con :: (Match a) => [a] -> Regexp
con = foldr (<*>) one
neg :: (Match a) => a -> Regexp
neg x =
case (match x) of
(Not y) -> y
y -> Not y
star :: (Match a) => a -> Regexp
star x =
case (match x) of
(Zero) -> Zero
(Star y) -> Star y
y -> Star y
-- Returns if the regex matches the empty string.
del :: Regexp -> Bool
del (Zero) = False
del (Sum x) = or . map del $ Set.toList x
del (Prod x) = and . map del $ x
del (Match _) = False
del (Not x) = not $ del x;
del (Star _) = True
-- The derivative of a regular language A with respect to a character
-- c is dA/dc = { s : cs \in A }
diff :: Char -> Regexp -> Regexp
diff _ (Zero) = zero
diff c (Match d) | (c == d) = one
diff c (Match d) = zero
diff c (Sum x) = oneOf $ (map $ diff c) (Set.toList x)
diff c (Prod []) = zero
diff c (Prod (x:xs)) | del x = (diff c x <*> xs) <+> diff c (Prod xs)
diff c (Prod (x:xs)) = diff c x <*> xs
diff c (Not x) = Not (diff c x)
diff c (Star x) = diff c x <*> Star x
flattenSet :: (Ord a) => Set (Set a) -> Set a
flattenSet = Set.fold Set.union Set.empty
(/>>=) :: (Ord a, Ord b) => Set a -> (a -> Set b) -> Set b
x />>= f = flattenSet (Set.map f x)
-- The alphabet of all characters that appear in this regexp
alphabet :: Regexp -> Set Char
alphabet (Zero) = Set.empty
alphabet (Sum x) = flattenSet (Set.map alphabet x)
alphabet (Prod x) = Set.unions $ map alphabet x
alphabet (Not x) = alphabet x
alphabet (Star x) = alphabet x
alphabet (Match c) = Set.singleton c
-- Set of all derivatives of a regular expression (including itself, and higher order derivatives).
derivatives :: Regexp -> [Regexp]
derivatives exp = Set.toList $ enlarge (Set.singleton exp) (Set.singleton exp)
where
alpha = alphabet exp
firstDerivatives x = Set.map (`diff` x) alpha
enlarge :: Set Regexp -> Set Regexp -> Set Regexp
enlarge new found =
if Set.null new
then found
else
let nextNew = (new />>= firstDerivatives) Set.\\ found
nextFound = found `Set.union` nextNew
in enlarge nextNew nextFound
-- A simple finite state machine type
data FSM = State { transitions :: (Map Char FSM), isFinal :: Bool }
-- Converts a Regexp into a finite state machine by using the derivatives
-- with respect to specific characters as the transitions. Essentially at
-- each stage we build up a regular expression that the remaining characters
-- have to match. Due to Cunning Mathematics, only finitely many such regular
-- expressions (up to similarity) result.
compile :: Regexp -> FSM
compile x = fromJust $ Map.lookup x states
where
states :: Map Regexp FSM
states = Map.fromList $
do re <- derivatives x -- Totally gratuitious use of list monad. :)
let trans = do c <- Set.toList $ alphabet re
let d = diff c re
return (c, fromJust $ Map.lookup d states)
let state = State (Map.fromList trans) (del re)
return (re, state)
runFSM :: FSM -> String -> Bool
runFSM x [] = isFinal x
runFSM x (c:cs) = case (Map.lookup c $ transitions x) of
Nothing -> False
Just y -> runFSM y cs
matches :: (Match a) => String -> a -> Bool
matches cs exp = runFSM (compile $ match exp) cs






