Well, here's my solution to the problem (I signed up just to post it :) ):
It's in Python (currently using the 2.x syntax, but the conversion to 3.x is simple: use print() instead of print, and "except X as e" instead of "except X, e")
class ParseException(Exception):
    """
    Convenient way of indicating errors to be reported to the user.

    Attributes:
        reason: short human-readable description of what went wrong.
        line: 1-based line number on which the problem was detected.
    """

    def __init__(self, reason, line):
        # Forward the arguments to Exception so that ``args`` is populated;
        # without this ``e.args`` is empty and copying/pickling the
        # exception (which rebuilds it from ``args``) fails.
        super(ParseException, self).__init__(reason, line)
        self.reason = reason
        self.line = line

    def __str__(self):
        return "Parse error on line %d: %s" % (self.line, self.reason)
def parse_string(text):
    """
    State machine that splits up the incoming text into a logical form
    that can be more easily processed later.

    Returns a dict which maps each selector to its body. A selector can
    be any string (it may contain ":" as in "a:hover"); whitespace,
    including newlines, before or after an element (selector, property,
    value) is ignored. The body of a selector is a list of dicts, one per
    property, each with the keys 'property', 'value' and 'line' (the line
    on which the terminating ";" or "}" was read). Bodies of a selector
    that appears more than once are accumulated in the same list.

    Raises ParseException if the text is malformed at a very basic
    level: a misplaced "{", "}", ";" or ":" or a body that is still open
    when the input ends. The selector, property and value syntax itself
    is not checked; that is delegated to users of this function.
    Comments and quoted strings are not treated specially.
    """
    # TODO do this without loading the entire file into memory

    # possible states:
    # selector: we start out here and enter "key" on an opening brace;
    #           we come back to it after a closing brace
    #
    # key: we enter here after an opening brace in a selector (or after
    #      a ";"). we exit upon reading a ":" character
    #
    # value: we enter from "key" after a ":" character. we exit either
    #        at "}" or ";". The last (or single) statement in a body is
    #        allowed not to be terminated by a semi-colon
    state = "selector"
    result = {}
    buf = []  # characters of the element being read; joined at delimiters
    cur_line = 1
    cur_selector = None
    cur_key = None

    for char in text:
        if char == "{":
            if state != "selector":
                raise ParseException("illegal '{' inside a selector body", cur_line)
            cur_selector = "".join(buf).strip()
            buf = []
            result.setdefault(cur_selector, [])
            state = "key"
        elif char == "}":
            if state == "selector":
                raise ParseException("illegal '}' inside a selector", cur_line)
            pending = "".join(buf).strip()
            if pending:
                if state == "key":
                    raise ParseException("illegal '}' inside a property", cur_line)
                # there's a dangling key:value that hasn't been inserted
                # yet; strip it so it matches values terminated by ";"
                # TODO line reporting here isn't very accurate
                # (multiline property definition?), consider a better
                # strategy maybe?
                result[cur_selector].append(
                    {'property': cur_key, 'value': pending, 'line': cur_line})
            # NOTE: a key followed by an empty value right before "}" is
            # silently dropped rather than reported.
            buf = []
            state = "selector"
        elif char == ";":
            pending = "".join(buf).strip()
            if state != "value" or not pending:
                raise ParseException("Illegal ';'", cur_line)
            result[cur_selector].append(
                {'property': cur_key, 'value': pending, 'line': cur_line})
            buf = []
            state = "key"
        elif char == ":" and state == "key":
            # ":" only separates a property from its value; inside a
            # selector ("a:hover") or a value ("url(http://...)") it is an
            # ordinary character and falls through to the branch below
            pending = "".join(buf).strip()
            if not pending:
                raise ParseException("Illegal ':'", cur_line)
            cur_key = pending
            buf = []
            state = "value"
        else:
            if char == "\n":
                # All end of lines show up as "\n" even on windows as
                # long as the file is opened in text mode
                cur_line += 1
            buf.append(char)

    if state != "selector":
        # the input ended while a "{" was still waiting for its "}"
        raise ParseException("unexpected end of input inside a selector body", cur_line)
    return result
if __name__ == "__main__":
    # Command-line entry point: parse every file named on the command line
    # and pretty-print the result, or report why it could not be parsed.
    # Written so that it runs unchanged on Python 2.6+ and 3.x
    # (single-argument print(...) and "except ... as e").
    import pprint
    import sys

    pp = pprint.PrettyPrinter(indent=4)
    if len(sys.argv) < 2:
        print("Usage: %s <file to parse> [<other file> [...]]" % sys.argv[0])
    for fname in sys.argv[1:]:
        print("Parsing %s" % fname)
        try:
            # "with" closes the file even when reading or parsing fails
            with open(fname) as source:
                parsed = parse_string(source.read())
        except ParseException as e:
            print("Invalid!")
            print(e)
        except IOError as e:
            print("Cannot read file!")
            print(e)
        else:
            pp.pprint(parsed)
Here is the result when run on all three examples:
$ python cssvalidator.py test_file.css test_file2.css test_file3.css
Parsing test_file.css
{ 'SELECTOR1': [ { 'line': 2, 'property': 'PROPERTY1', 'value': 'VALUE'},
{ 'line': 3, 'property': 'PROPERTY2', 'value': 'VALUE'}],
'SELECTOR2': [ { 'line': 7, 'property': 'PROPERTY1', 'value': 'VALUE'},
{ 'line': 8, 'property': 'PROPERTY2', 'value': 'VALUE'}]}
Parsing test_file2.css
{ '.foo': [ { 'line': 6, 'property': 'color', 'value': 'Black'},
{ 'line': 7, 'property': 'border-style', 'value': 'solid'}],
'body': [{ 'line': 2,
'property': 'background-color',
'value': '#FF0000'}]}
Parsing test_file3.css
Invalid!
Parse error on line 1: illegal '{' inside a selector body
This parser essentially massages the input into a form which is easy to validate later in other ways (specific rules about valid selector/property/value specifications, for example). It only checks for very basic validity itself.
I ran it on a pretty large CSS file from a project I'm currently working on and it worked just fine. If anyone has any examples which break my code (either by getting it to error out on a valid input or by not doing so on invalid input), let me know :)