preprocessor.fsr

;;; -*- mode: fundamental; coding: utf-8; indent-tabs-mode: t; -*-


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; tokenization pattern: after normalization, the string will be broken up at
;;; each occurrence of this pattern; the pattern match itself is deleted.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; i.e. split at every run of one or more space or tab characters
:[ \t]+

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; string rewrite rules: all matches, over the entire string, are replaced by
;;; the right-hand side; grouping (using `(' and `)' in the pattern) and group
;;; references (`\1' for the first group, et al.) carry over part of the match.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; pad the full string with trailing and leading whitespace; makes matches for
;;; word boundaries a little easier down the road.
;;;
;;; NOTE(review): the active rule differs from the disabled variant only by the
;;; `^' in front of the group reference -- presumably a literal start marker
;;; emitted into the padded string; confirm against the grammar.
;!^(.+)$					 \1 
!^(.+)$					 ^\1 

;;;
;;; separate a hash sign from a digit immediately to its right by inserting a
;;; single space between the two
;;;
!([#])([0-9])				\1 \2

;;;
;;; Eliminate a spurious single space preceding ordinary punctuation, i.e. one
;;; of  : ; , . ! ) ?   (the colon is listed twice in the class; harmless)
;;;
! ([:;,.:!\)\?])		\1

;;;
;;; Replace quotation marks («, », ") -- and likewise commas, question marks
;;; and asterisks -- by the right-hand side of the rule.
;;; NOTE(review): the right-hand side appears to consist of blank space only,
;;; not of the empty string; confirm how the rule reader treats it.
;;;

![,«»"?*]					  

;;; disabled rule, kept for reference
;+(.*)([s.])		\1 \2


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; from here on, token-specific rules, i.e. the pattern has to match the full
;;; string of the token (implicit `^' and `$' anchoring).  three types of rules
;;; for now: (i) substitution (`-'), replacing the token with the right-hand
;;; side match, (ii) augmentation (`+'), adding an alternative spelling for the
;;; token, and (iii) ersatzing (`^'), effectively a substitution but recording
;;; what the original string was for later retrieval.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; e-mail addresses: the local part consists of letters, digits and `-'; the
;;; domain part may additionally contain `.'.  The upper-case range has to be
;;; `A-Z': `A-z' would also admit `[', `\', `]', `^', `_' and the backquote.
+[-a-zA-Z0-9]+@[-.a-zA-Z0-9]+			EmailErsatz

;;;
;;; Rules for splitting off parentheses, colons and commas from an adjacent
;;; word (letters including æøå/ÆØÅ, digits and hyphens) by inserting a space.
;;; The hyphen is listed once, at the very start of the character class, where
;;; it is unambiguously literal; a second `-' between `9' and `æ' is redundant
;;; and its interpretation (literal vs. range operator) is engine-dependent.
;;; NOTE(review): commas are already rewritten by the earlier `[,«»"?*]' rule,
;;; so the comma rule below may never fire -- confirm.
;;;

!(\()([-a-zA-Z0-9æøåÆØÅ]+)(\))		\1 \2 \3
!(\()([-a-zA-Z0-9æøåÆØÅ]+)		\1 \2
!([-a-zA-Z0-9æøåÆØÅ]+)(\))		\1 \2

!([-a-zA-Z0-9æøåÆØÅ]+)(:)		\1 \2

!([-a-zA-Z0-9æøåÆØÅ]+)(,)		\1 \2


;;;
;;; put spaces around a slash that sits between two letters, between a digit
;;; and a letter, or between a letter and a digit; a slash between two digits
;;; is left alone (cf. the FractionErsatz pattern `[0-9]{1,2}/[0-9]{1,2}').
;;;
!([a-zA-ZæøåÆØÅ])([/])([a-zA-ZæøåÆØÅ])		\1 \2 \3
!([0-9])([/])([a-zA-ZæøåÆØÅ])			\1 \2 \3
!([a-zA-ZæøåÆØÅ])([/])([0-9])			\1 \2 \3

;;; NOTE(review): all active rules below use `+' (augmentation) although their
;;; right-hand sides are named ...Ersatz; the only `^' (ersatz) rule is the
;;; disabled one for decimal points right below -- confirm this is intended.
;fix_me
;^[0-9]+\.[0-9]+				DecimalErsatz
;;; numbers with a decimal comma, simple fractions (one or two digits on each
;;; side of the slash), and numbers with comma-separated groups of three
;;; digits.  note that a token such as `1,000' matches both the DecimalErsatz
;;; and the NumberErsatz pattern.
+[0-9]+,[0-9]+					DecimalErsatz
+[0-9]{1,2}/[0-9]{1,2}				FractionErsatz
+[0-9]{1,3},[0-9]{3}				NumberErsatz
+[0-9]{1,3},[0-9]{1,3},[0-9]{3}			NumberErsatz


;;; plain digit strings: a single digit 2-9 (`0' and `1' are not covered by
;;; this rule), or any string of two or more digits
+[2-9]{1}					OneDigitErsatz
+[0-9]{2,}					TwoPlusDigitsErsatz

;;; disabled: digit groups separated by a single space
;+[0-9]+[ ][0-9]{3}				TwoPlusDigitsErsatz
;+[0-9]+[ ][0-9]{3}[ ][0-9]{3}			TwoPlusDigitsErsatz

;;; disabled: substitution variants for an ersatz followed by a period
;-OneDigitErsatz \.		OneDigitErsatzDef
;-TwoPlusDigitsErsatz \.		TwoPlusDigitsErsatzDef

;;; digits immediately followed by a period: one digit 1-9, or exactly two
;;; digits; the variant for three or more digits is disabled
+[1-9]{1}\.					OneDigitErsatzDef
+[0-9]{2}\.					TwoDigitErsatzDef
;+[0-9]{3,}\.					ThreePlusDigitErsatzDef

;;; disabled: ranges of digits around a hyphen
;+[0-9\ ]+ ?[-] ?[0-9\ ]+			BetweenDigitsErsatz

;;; clock times of the form H:MM or HH:MM; the hour part `[0-2]?[0-9]' accepts
;;; 0 through 29, the minute part 00 through 59
+[0-2]?[0-9]:[0-5][0-9]				ClockTimeErsatz

;;; URLs (starting with `http://' or `www.') and e-mail addresses, each
;;; optionally wrapped in angle brackets
+<?http://.*>?					WebErsatz
+<?www\.[a-zA-Z0-9.?%/_-]+>?			WebErsatz
+<?[a-zA-Z0-9_-]+@[a-zA-Z0-9._-]+>?		EmailErsatz

;;; an apostrophe followed by exactly two digits, e.g. '98
+'[0-9][0-9]					YearErsatz

-(.*) (\.)