;;; synfl-lock.el --- syntactically font-lock a buffer

;; Copyright (C) 2004 Stephen J. Turnbull

;; Author: Stephen J. Turnbull <stephen@xemacs.org>
;; Created: 2004 April 22
;; Keywords: fontlock, programming

;; This file is not part of XEmacs.

;; This program is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License, version 2, as
;; published by the Free Software Foundation.

;; This program is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with XEmacs; see the file COPYING (C-h C-l).  If not, write to
;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;; This stuff is mostly hacks.

;;; Code:

(require 'synfl)

;; Parse tree API

;; This was a pretty unsuccessful attempt at a generic API for parses.  It's
;; used in the implementation below, but it's pretty yucky and will be 
;; replaced here, too, no doubt.

;; This class has some reason to be relatively complex, as the nodes may be
;; objects defined externally, such as extents for font-locking.  So we
;; can't count on the objects supporting property lists or any other specific
;; way to access structure members.

;; A *node* is an opaque object of some node class with these members:
;;   children (optional) - a list of nodes
;;   symbol - a symbol of a synfl grammar

;; A *tree* is a *node* with a "children" member.  The children are ordered.
;; (In many implementations you can't normally tell the difference, since
;; `get' will return `nil' for a node that has no children as well as for a
;; node that is not a tree.  This is a GoodThang[tm].)

;; A node class has a class definition which is an array.  Elements are
;; mostly accessor and mutator functions.  It must have an accessor for each
;; member defined, and may have a mutator.  If the mutator is not present, then
;; either the member is constant or you must use setf.  If the member is non-
;; constant, then either a mutator or a setf update function must be defined.

;; Accessors and mutators for simple node class definitions
;; All accessors take a single argument NODE.
;; All accessors take two arguments NODE and VALUE.
;; The constructor takes a single argument OBJECT.  The constructor may
;;   choose to construct the node in OBJECT.  Typically the OBJECT is a
;;   token from a token stream.
;; The tree-constructor takes two arguments, NODE and CHILDREN.  CHILDREN is
;;   a list of nodes.
(defsubst synfl-node-class-name (def) (aref def 0))
(defsubst synfl-node-set-class-name (def name) (aset def 0 name))
(defsubst synfl-node-children-accessor (def) (aref def 1))
(defsubst synfl-node-children-set-accessor (def fn) (aset def 1 fn))
(defsubst synfl-node-children-mutator (def) (aref def 2))
(defsubst synfl-node-children-set-mutator (def fn) (aset def 2 fn))
(defsubst synfl-node-symbol-accessor (def) (aref def 3))
(defsubst synfl-node-symbol-set-accessor (def fn) (aset def 3 fn))
(defsubst synfl-node-symbol-mutator (def) (aref def 4))
(defsubst synfl-node-symbol-set-mutator (def fn) (aset def 4 fn))
(defsubst synfl-node-constructor (def) (aref def 5))
(defsubst synfl-node-set-constructor (def fn) (aset def 5 fn))
(defsubst synfl-node-tree-constructor (def) (aref def 6))
(defsubst synfl-node-set-tree-constructor (def fn) (aset def 6 fn))

(defconst synfl-node-predefined-size 7)

;(eval-when-compile (load-library "cl-macs"))

(put 'synfl-node-make-class 'lisp-indent-function 2)

(defun* synfl-node-make-class
  (name size
   &key constructor tree-constructor children set-children symbol set-symbol)

  (let ((def (make-vector size nil)))
    (synfl-node-set-class-name def name)
    (when constructor (synfl-node-set-constructor def constructor))
    (when tree-constructor
      (synfl-node-set-tree-constructor def tree-constructor))
    (when children (synfl-node-children-set-accessor def children))
    (when set-children (synfl-node-children-set-mutator def set-children))
    (when symbol (synfl-node-symbol-set-accessor def symbol))
    (when set-symbol (synfl-node-symbol-set-mutator def set-symbol))
    def))


;; generic functions for trees

(defsubst synfl-node-make-tree (node children)
  (let ((class (get node :node-class)))
    (funcall (synfl-node-children-mutator class) node children)
    node))


(defun synfl-construct-token-regexp (specs)
  "Return a regexp matching a token constructed from SPECS.

Each element of SPECS will have a corresponding capturing group in the regexp."

  (let ((re (concat "\\(" (caar specs)))
	(specs (cdr specs)))
    (while specs
      (setq re (concat re "\\)\\|\\(" (caar specs))
	    specs (cdr specs)))
    (setq re (concat re "\\)"))
    ;; minimal sanity check
    (when (string-match re "")
      (error 'args-out-of-range "the token regexp matches the null string"))
    re))


;; an example token stream implementing font-lock for C

;; token streams are implemented as symbols' properties to avoid need for
;; setting a variable to a plist; this API is specified in synfl.el

;; tokenizer specification

;; #### update this to use the constructor
;; fields required by the token stream API
(put 'lock-c-token-stream :synfl-stream-class #'lock-c-token-stream)
(put 'lock-c-token-stream :iterate     #'lock-c-get-token)
(put 'lock-c-token-stream :rewind      (lambda (x) (goto-char (point-min))))
(put 'lock-c-token-stream :token-type  (lambda (x)
					 (if (eq x :eof)
					     :eof
					   (extent-property x :synfl-token))))
(put 'lock-c-token-stream :token-value (lambda (x) nil))
(put 'lock-c-token-stream :token-text  (lambda (x) (extent-string x)))

;; implementation-specific fields

;; lexer implementation

;; an unordered list of token specs
;; use of regexps to determine context is evil; don't do it; just match tokens
;; (and use `synfl-lex-token-generic' to construct the token) or match prefixes
;; (and use a type-specific function to complete the lexical item)
(put 'lock-c-token-stream :stream-token-spec-list
     ;; List of token specs, each a list of a regexp, a function, and a face.
     ;; Spec count must be less than the maximum capturing groups (255?).

     ;; The regexp may not contain capturing groups.

     ;; The function should take two positions BEGIN and END, which will be
     ;; taken from the match-data for the regexp, and a face name DEFAULT-FACE
     ;; as arguments, and return a token.  It should leave point after the end
     ;; of the token.  If the token is semantically significant, the function
     ;; is responsible for making and updating the entry in the symbol table.

     ;; A token is an extent with a non-`nil' :synfl-token property, and
     ;; (optionally) a non-`nil' face property.

     ;; The regexps should either be very generic, ie an identifier, or
     ;; constructed with regexp-opt.  The lexer will use the strategy of
     ;; priming the symbol table with keywords
     '(("\\\"" synfl-lex-string-c font-lock-string-face)
       ("/\\*" synfl-lex-comment-c font-lock-comment-face)
       ("#" synfl-lex-directive-cpp font-lock-preprocessor-face)
       ;; C operators and delimiters
       (">>=?\\|<<=?" synfl-lex-token-generic font-lock-builtin-face)
       ("[-+*/%&|^!=]=?" synfl-lex-token-generic font-lock-builtin-face)
       ("\\+\\+\\|--\\|[][()?:'{},;><~]" synfl-lex-token-generic
	font-lock-builtin-face)
       ;; identifiers
       ;; #### can get rid of \( and \)+ by using skip-syntax-forward in
       ;; synfl-lex-token-generic, cf synfl-lex-directive-cpp
       ("\\(\\sw\\|\\s_\\)+" synfl-lex-token-generic
	font-lock-variable-name-face)))

;; initialize the symbol table with keywords and type names
(put 'lock-c-token-stream :stream-symbol-table
     (let ((x (make-hash-table :test #'equal)))
       ;; #### these lambdas should be defined as a function users can call
       (mapc (lambda (token)
	       (puthash token
			'((:type . keyword) (:face . font-lock-keyword-face))
			x))
	     '("if" "for" "struct" "typedef" "while" "do" "else" "return"
	       "char" "switch" "case" "default"))
       (mapc (lambda (token)
	       (puthash token
			'((:type . type) (:face . font-lock-type-face))
			x))
	     '("int" "float" "static" "extern" "const" "volatile" "long"
	       "unsigned" "void" "double" "char"))
       x))

;; the actual tokenizer regexp is generated here from the token-spec-list
(put 'lock-c-token-stream :stream-token-re
     (synfl-construct-token-regexp
      (get 'lock-c-token-stream :stream-token-spec-list)))


;; the iterator method

(defun lock-c-get-token (stream)
  "Lex a token from the buffer, advancing point, and returning point.

The token is represented as an extent."

  (if (re-search-forward (get stream :stream-token-re) nil 'always-move)
      (let ((data (synfl-lookup-token-data (get stream
						:stream-token-spec-list))))
	(funcall (third data) (first data) (second data) (fourth data)))
    :eof))



;; extents as nodes
;; #### move the code of :constructor and :tree-constructor into separately
;; defined functions
(defconst lock-c-node-class
  (synfl-node-make-class 'lock-c-node (+ synfl-node-predefined-size 0)
    ;; accessors -- so far, so trivial
    :children		(lambda (node) (extent-property node :children))
    :set-children	(lambda (node children)
			  (set-extent-property node :children children))
    :symbol		(lambda (node) (extent-property node :synfl-token))
    :set-symbol		(lambda (node symbol)
			  ;; is this ever needed for this stream?
			  (when (synfl-terminal-p symbol)
			    (set-extent-property node :synfl-token symbol)))
    ;; making nodes -- easy enough
    :constructor	(lambda (object)
			  (when (and (extentp object)
				     ;; paranoia for debugging
				     (synfl-terminal-p
				      (extent-property object :synfl-token)))
			    (set-extent-property object :node-class
						 lock-c-node-class))
			  object)
    ;; making trees -- where the action is
    :tree-constructor	(lambda (node children)
			  (cond ((and (extentp node)
				      ;; paranoia for debugging
				      (synfl-terminal-p
				       (extent-property node :synfl-token)))
				 ;; paranoia for debugging
				 (when children
				   (error 'syntax-error
					  "Can't reduce a terminal"
					  (extent-property node :synfl-token)))
				 ;; return the node-ified token
				 node)
				((eq node 'token)
				 ;; paranoia for debugging
				 (unless (= 1 (length children))
				   (error 'syntax-error
					  "Wrong number of tokens"
					  1 (length children)))
				 (unless (synfl-terminal-p
					  ;; if (car children) is not an
					  ;; extent, something is seriously
					  ;; wrong
					  (extent-property (car children)
							   :synfl-token))
				   (error 'syntax-error
					  "Can't reduce to"
					  'token (extent-property
						  (car children)
						  :synfl-token)))
				 ;; unwrap the singleton node and return it
				 ;; this collapses the single production
				 ;; (ASU sec. 5.2)
				 (car children))
				((eq node 'tokens)
				 (cond ((= 1 (length children))
					;; children == (token)
					;; prepend an extent for the list
					;; the gymnastics with indirection
					;; through a symbol are for the sake
					;; of synfl-trace which prints out the
					;; stack, which contains the list...
					(let ((sym (gensym)))
					  (set
					   sym
					   (cons (let ((e (car children)))
						   (make-extent
						    (extent-start-position e)
						    (extent-end-position e)))
						 children))
					  sym))
				       ((= 2 (length children))
					;; LIFO, children == (token tokens)
					(let* ((token (car children))
					       (list-sym (cadr children))
					       (elst (symbol-value list-sym))
					       (ext (car elst))
					       (rest (cdr elst)))
					  (set-extent-endpoints
					   ext
					   (min (extent-start-position token)
						(extent-start-position ext))
					   (max (extent-end-position token)
						(extent-start-position ext)))
					  (set list-sym
					       (cons ext (cons token rest)))
					  list-sym))
				       ;; paranoia for debugging
				       ;; there are other errors, though
				       (t (error 'syntax-error
						 "wrong handle length"
						 2 (length children)))))
				((eq node 'program)
				 ;; children = (tokens)
				 (let* ((tokens (symbol-value (car children)))
					(tree (car tokens)))
				   ;; if (car children) is not an extent,
				   ;; something is seriously wrong
				   (set-extent-property tree
							:node-class
							lock-c-node-class)
				   (set-extent-property tree
							:children
							(reverse (cdr tokens)))
				   tree))
				(t (error 'syntax-error "unknown node type"
					  node)))))
  "A node class for making simple trees.")



;; the driver program

(defvar synfl-parse-result nil)

(defun synfl-initial-parse ()
  "Parse the stream of tokens from the buffer."

  (interactive)

  ;; clean up just in case
  (synfl-stream-rewind 'lock-c-token-stream)

  ;; parser specification
  (setq synfl-grammars (delete (assq 'lock-c synfl-grammars) synfl-grammars))

  (synfl-make-grammar 'lock-c
    ;; #### if this used keyword arguments no comments would be needed...
    ;; terminals
    '(generic preprocessor badtoken string comment character-constant)
    ;; nonterminals
    '(program tokens token)
    ;; start symbol
    'program
    ;; productions
    '((program (tokens)			nil)
      (program nil			nil) ; program can be empty
      (tokens (tokens token)		nil)
      (tokens (token)			nil)
      (token (generic)			nil)
      (token (preprocessor)		nil)
      (token (badtoken)			nil)
      (token (string)			nil)
      (token (comment)			nil)
      (token (character-constant)	nil)))

  (save-excursion
    (save-restriction
      (goto-char (point-min))
      ;; #### need to change the API of node classes, they should be
      ;; symbols; it's too easy to add the ' for consistency
      (with-displaying-temp-buffer "*c font-lock trace*"
	(synfl-grammar-print 'lock-c)
	(terpri)
	(synfl-collection-print
	 (synfl-compute-sets-of-slr-items (synfl-find-grammar 'lock-c)))
	(terpri)
	(synfl-grammar-print 'lock-c)
	(terpri)
	(synfl-generate-first 'lock-c)
	(synfl-generate-follow 'lock-c)
	(synfl-generate-parser 'lock-c)
	(synfl-tables-print 'lock-c)
	(setq synfl-parse-result
	      (synfl-lr-parse 'lock-c-token-stream 'lock-c lock-c-node-class)))
      )))



;; Utilities and helper functions

(defun synfl-lookup-token-data (specs)
  "Look up information about parsing the matched token in SPECS.

Return a list `(BEGIN END FUN FACE)' where BEGIN and END are the buffer
positions delimiting the token from the match data, FUN is the function for
constructing the token, and FACE is to be applied to the token for display."

  (let ((i 1))
    (while (and specs (not (match-beginning i)))
      (setq i (1+ i)
	    specs (cdr specs)))
    ;; if specs croaks, something is badly broken
    (cons (match-beginning i)
	  (cons (match-end i)
		(cdar specs)))))    


;; lexer helper functions (in token-spec-list)

(defun synfl-lex-token-generic (begin end default-face)
  "Return synfl token for the string between BEGIN and END.
DEFAULT-FACE may be used to set the token's face.
The :synfl-token property is set to 'generic."

  (goto-char end)
  ;; #### abstract me!
  (let* ((e (make-extent begin end))
	 (entry (gethash (extent-string e)
			 (get 'lock-c-token-stream :stream-symbol-table)))
	 (type (cdr (assq :type entry)))
	 ;; this is probably bogus
	 (face (or (cdr (assq :face entry)) default-face)))
    ;; if type is keyword, could set this to (intern keyword) for use
    ;; of a smarter grammar
    ;; #### we'd like to use lookup, but currently the parser doesn't know
    ;;(set-extent-property e :synfl-token (or type 'generic))
    (set-extent-property e :synfl-token 'generic)
    (when face (set-extent-face e face))
    e))


(defun synfl-lex-directive-cpp (begin end default-face)
  "Return synfl token for the preprocessor directive starting at BEGIN.
DEFAULT-FACE may be used to set the token's face.
The :synfl-token property is set to 'preprocessor."

  (goto-char begin)
  (if (not (bolp))
      (synfl-lex-bad-token begin end 'font-lock-warning-face)
    (goto-char end)
    (skip-syntax-forward "-")
    (skip-syntax-forward "w_")
    ;; #### abstract me! cf other synfl-lex- functions
    (let* ((e (make-extent begin (point)))
	   (entry (gethash (extent-string e)
			   (get 'lock-c-token-stream :stream-symbol-table)))
	   (type (cdr (assq :type entry)))
	   ;; this is probably bogus
	   (face (or (cdr (assq :face entry)) default-face)))
      ;; #### we'd like to use lookup, but currently the parser doesn't know
      ;;(set-extent-property e :synfl-token (or type 'preprocessor))
      (set-extent-property e :synfl-token 'preprocessor)
      (when face (set-extent-face e face))
      e)))


(defun synfl-lex-bad-token (begin end default-face)
  "Return a token marked as a syntax error from BEGIN to END.
DEFAULT-FACE may be used to set the token's face.
The :synfl-token property is set to 'badtoken."

  (let ((e (make-extent begin (point))))
    (set-extent-property e :synfl-token 'badtoken)
    (set-extent-face e (or default-face 'font-lock-warning-face))
    e))


(defun synfl-lex-string-c (begin end default-face)
  "Return synfl token for the string with open quote at BEGIN.
DEFAULT-FACE may be used to set the token's face.
Point is left after the terminating double quote mark.
The :synfl-token property is set to 'string."

  (goto-char end)
  (while (progn (skip-chars-forward "^\"\\\\")
		(forward-char) ; if this errors, buffer is borked
		(when (eql (char-before) ?\\)
		  (forward-char)
		  t)))
  (forward-char 1)
  (let ((e (make-extent begin (point))))
    (set-extent-property e :synfl-token 'string)
    (set-extent-face e (or default-face 'font-lock-string-face))
    e))


(defun synfl-lex-comment-c (begin end default-face)
  "Return synfl token for the string with open comment at BEGIN.
DEFAULT-FACE may be used to set the token's face.
Point is left after the comment terminator.
The :synfl-token property is set to 'comment."

  (goto-char end)
  ;; if the search errors, buffer is borked
  (let ((e (make-extent begin (search-forward "*/"))))
    (set-extent-property e :synfl-token 'comment)
    (set-extent-face e (or default-face 'font-lock-comment-face))
    e))


(defun synfl-lex-charconst-c (begin end default-face)
  "Return synfl token for the string with character constant at BEGIN.
DEFAULT-FACE may be used to set the token's face.
Point is left after the terminating single quote mark.
The :synfl-token property is set to 'character-constant."

  (goto-char end)
  ;; if the search errors, buffer is borked
  (let ((e (make-extent begin (progn
				(skip-chars-forward "^'")
				(forward-char 1)
				(point)))))
    (set-extent-property e :synfl-token 'character-constant)
    (set-extent-face e (or default-face 'font-lock-string-face))
    e))


(provide 'synfl-lock)

;;; synfl-lock.el ends here
