CFG Reader(NT-Table)
A. Seventh Edition
This is seventh edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition,
I finished the establishing NT-table and mapping of token-type from "scanner" with terminals in my grammar.
M -> program i ; Dl B
Dl -> Dv Ml
Ml -> Ml Mo | e
Mo -> module i ( Vl ) Dv B
Dv -> variables Vl | e
Vl -> Vl V | V
V -> Il : T ;
T -> integer Ad ; | char Ad
Il -> i | Il , i
L -> i Ar
Ad -> e | array [ n ]
B -> begin Sl end ;
Sl -> S | S Sl
S -> L := E ; | if C then S else S | loop Sl end ; | exit ; | i ( Lp ) ; | B |
read Ln ; | write Lo ; | e ;
Lp -> e | Ln
Ln -> L { , L }
Lo -> Lr { , Lr }
Lr -> i Ar | n | c
Ar -> e | [ E ]
E -> F { Oa F }
Oa -> + | -
F -> R { Om R }
Om -> * | /
R -> L | n | ( E ) | c
C -> E Or E
Or -> = | < | > | <= | >= | !=
I cannot remove Tokens! I cannot discover INDIRECT common-left-factors. I cannot remove redundant variables.
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
file name: CFGReader.h
#include "Grammar.h"
class CFGReader
{
private:
char buffer[BufferLength];
FILE* stream;
void newRule(const char* str);
void readRule();
void addRule(const char* str);
int addToken(const char* str, bool isTerminal=true);
void printRule(int index);
public:
void readFromFile(const char* fileName);
void optimize();
void print();
};
file name: CFGReader.cpp
#include <iostream>
#include "CFGReader.h"
using namespace std;
Grammar grammar;
const char* deliStr=" |\n";
//this would only remove the immediate left recursion
//and should I expand it to remove all kinds of left-recursion?
void CFGReader::readFromFile(const char* fileName)
{
if ((stream=fopen(fileName, "r"))==NULL)
{
cout<<"cannot open file "<<fileName<<endl;
}
else
{
readRule();
}
}
void CFGReader::readRule()
{
char* ptr=buffer;
char* hold;
int count=0;
while (!feof(stream))
{
count=0;
fgets(buffer, BufferLength-1, stream);
ptr=buffer;
while ((ptr=strtok(ptr, " \n"))!=NULL)
{
if (strcmp(ptr, "|")!=0)
{
//test the first two token to see if old rule continues
if (count==0)
{
hold=ptr;
}
if (count==1)
{
if (strcmp("->", ptr)==0)
{
/*
curIndex=addToken(hold, false);//the index of cur token
grammar[curIndex]->terminal=false;
newRule();
*/
newRule(hold);
}
else
{
addRule(hold);
addRule(ptr);
}
}
if (count>=2)//always add
{
addRule(ptr);
}
count++;
}
else
{
//this is an alternative production rule
newRule(NULL);
}
ptr=NULL;
}
}
}
void CFGReader::newRule(const char* str)
{
grammar.newRule(str);
}
void CFGReader::optimize()
{
grammar.optimize();
}
void CFGReader::addRule(const char* str)
{
grammar.addRule(str);
}
int CFGReader::addToken(const char* str, bool isTerminal)
{
return grammar.addToken(str, isTerminal);
}
void CFGReader::printRule(int index)
{
grammar.printRule(index);
}
void CFGReader::print()
{
grammar.print();
grammar.printToken(true);
grammar.buildTable();
grammar.printTable();
//grammar.printAllRules();
}
file name: Grammar.h
const int BufferLength=256;
const int MaxTokenCount=80;
const int MaxRHSCount=80;
const int MaxRuleCount=200;
const int MaxTokenLength=10;
const int MaxProduction=10;
struct Token
{
//bool isMeta;
bool terminal;
bool isNull;
char* name;
int production[MaxProduction];//pointer to the production it gives
int count;
int firstCount;
int followCount;
int follow[MaxTokenCount];
int first[MaxTokenCount];
};
class Grammar
{
private:
int table[MaxTokenCount][MaxTokenCount];
int variables[MaxTokenCount];
int terminals[MaxTokenCount];
int tCount;
int nCount;
Token* token[MaxTokenCount];
int tokenCount;
int curIndex;//the index of current token
int curPos;//the position at production rule
//MaxRuleCount>=MaxVariableCount!!!
int production[MaxRuleCount][MaxRHSCount];
int prodIndex;//pointing to current production, initialized to -1
int checkRecursion(int tIndex, int curToken);
void grammarError(int theVar, int theToken, int rule1, int rule2);
void addFirstIntoTable(int theVariable, int theFirst, int theRule);
void addFollowIntoTable(int theVariable, int theRule);
void addBeginEnd();
bool addIntoFirst(int target, int source);
bool addFirst(int target, int source);
void shiftRule(int ruleIndex, bool Left2Right, int offset);
void addAtEnd(int ruleIndex, int toAdd);
void addRuleForToken(int tIndex, int ruleIndex);
int copyRule(int source, int target);//return the position of -1
void replaceRule(int curToken, int ruleIndex);
int findMetaSymbol(int& begin, int& end);
void initialize();
void doReplaceMetaSymbol(int ruleIndex, int begin, int end);
void removeRuleFromToken(int tIndex, int ruleIndex);
void checkEpsilon(int ruleIndex);
void removeImmediate(int tIndex, int ruleIndex);
void doAddRule(int tIndex);
void commonLeftFactor();
int findCommonFactor(int tokenIndex, int ruleIndex);
void removeLeftRecursion();
void doCommonFactor(int tokenIndex, int ruleIndex, int index);
int forgeToken(int index);//create a new variable
void epsilonRule(int ruleIndex);
bool isRedundant(int tIndex);
void replaceMetaSymbol();
void calculateFirst();
void calculateFollow();
void calculateNull();
bool Null(int tIndex);
bool addFollow(int target, int source);
bool addIntoFollow(int target, int source);
bool addFollowIntoFollow(int target, int source);
void removeRedundant();
void removeToken(int tIndex);
// void calculateProperty();
public:
Grammar();
void buildTable();
void optimize();
void printRule(int index);
void print();
void printTable();
void printAllRules();
void printToken(bool onlyVar=false);
int variableCount();
int terminalCount();
void addRule(const char* str);
void newRule(const char* str);//this is an internal method, not suitable for outside
Token* operator[](int index);
int addToken(const char* str, bool isTerminal=true);
Token* createToken(const char* theName, bool isTerminal);
int tokenIndex(const char* str);
};
file name: Grammar.cpp
#include <iostream>
#include "Grammar.h"
using namespace std;
const int TokenTypeCount=38;
const char* emptyStr="e";
const char* optionBegin="{";
const char* optionEnd="}";
const char* startStr="START";
const char* endStr="$";
int startSymbolIndex=-1;
int stackBottomIndex=-1;
int beginIndex=-1;
int endIndex=-1;
int emptyIndex=-1;
char* terminalStr[TokenTypeCount]=
{
//GENERAL TYPE 5
"i", "n", "c", "COMMENT", "ERROR",
//THE FOLLOWING ARE SYMBOL TYPE 18
"(", ")", ";", "+", "-", "*",
"/", ":=", "<", ">", "=", "<=",
">=", "!=", "[", "]", ",",
":",
//THE FOLLOWING ARE RESERVED TYPE 15
"begin", "end", "program", "variables","integer", "array", "char",
"module", "if", "then", "else", "loop", "exit", "read", "write"
};
//these are "hash-table-like" tables
int token2type[MaxTokenCount];
int type2token[MaxTokenCount];
int matchTokens(const char* str)
{
for (int i=0; i<TokenTypeCount; i++)
{
if (strcmp(terminalStr[i], str)==0)
{
return i;
}
}
return -1;
}
void Grammar::printTable()
{
/*
cout<<" ";
for (int i=0; i<tokenCount; i++)
{
cout<<"| "<<i;
}
cout<<endl;
*/
for (int r=0; r<tokenCount; r++)
{
if (token[r]->terminal)
{
continue;
}
cout<<token[r]->name<<":";
for (int c=0; c<tokenCount; c++)
{
if (table[r][c]!=-1)
{
cout<<token[c]->name<<"="<<table[r][c]<<",";
}
}
cout<<endl;
}
}
void Grammar::buildTable()
{
int typeIndex;
for (int r=0; r<tokenCount; r++)
{
if (token[r]->terminal)
{
typeIndex=matchTokens(token[r]->name);
if (typeIndex!=-1)
{
type2token[typeIndex]=r;
token2type[r]=typeIndex;
}
}
//initialize
for (int c=0; c<MaxTokenCount; c++)
{
table[r][c]=-1;
}
}
for (int i=0; i<tokenCount; i++)
{
if (token[i]->terminal)
{
continue;
}
for (int j=0; j<token[i]->count; j++)
{
int k=0, theRule=token[i]->production[j];
int theToken=production[theRule][k];
while (theToken!=-1)
{
addFirstIntoTable(i, theToken, theRule);
if (!token[theToken]->isNull)
{
break;
}
k++;
theToken=production[theRule][k];
}
if (theToken==-1)
{
addFollowIntoTable(i, theRule);
}
}
}
}
/*
nCount=tCount=0;
for (int r=0; r<MaxTokenCount; r++)
{
if (r<tokenCount)
{
if (token[r]->terminal)
{
typeIndex=matchTokens(token[r]->name);
if (typeIndex!=-1)
{
type2token[typeIndex]=r;
token2type[r]=typeIndex;
}
//I am making a invertable table!!!
tArray[r]=tCount++;
}
else
{
//it is an invertable table!!!
nArray[r]=nCount++;
}
}
for (int c=0; c<MaxTokenCount; c++)
{
table[r][c]=-1;
}
}
for (int i=0; i<tCount; i++)
{
for (int j=0; j<token[i]->count; j++)
{
int k=0, theRule=token[i]->production[j];
int theToken=production[theRule][k];
while (theToken!=-1)
{
addFirstIntoTable(theToken, j);
if (!token[theToken]->isNull)
{
break;
}
k++;
theToken=production[theRule][k];
}
if (theToken==-1)
{
addFollowIntoTable(j);
}
}
}
}
*/
void Grammar::grammarError(int theVar, int theToken, int rule1, int rule2)
{
cout<<"error! the conflict of grammar at ";
cout<<token[theVar]->name<<" for token of "
<<token[theToken]->name
<<" between rules of \n";
printRule(rule1);
cout<<" and ";
printRule(rule2);
cout<<endl;
}
void Grammar::addFollowIntoTable(int theVariable, int theRule)
{
int temp;
for (int i=0; i<token[theVariable]->followCount; i++)
{
temp=token[theVariable]->follow[i];
if (table[theVariable][temp]!=theRule)
{
if (table[theVariable][temp]!=-1)
{
grammarError(theVariable, temp, theRule, table[theVariable][temp]);
}
table[theVariable][temp]=theRule;
}
}
}
void Grammar::addFirstIntoTable(int theVariable, int theFirst, int theRule)
{
for (int i=0; i<token[theFirst]->firstCount; i++)
{
if (table[theVariable][token[theFirst]->first[i]]!=theRule)
{
if (table[theVariable][token[theFirst]->first[i]]!=-1)
{
grammarError(theVariable, token[theFirst]->first[i], theRule,
table[theVariable][token[theFirst]->first[i]]);
}
table[theVariable][token[theFirst]->first[i]]=theRule;
}
}
}
int Grammar::variableCount()
{
int result=0;
for (int i=0; i<tokenCount; i++)
{
if (!token[i]->terminal)
{
result++;
}
}
return result;
}
int Grammar::terminalCount()
{
int result=0;
for (int i=0; i<tokenCount; i++)
{
if (token[i]->terminal)
{
result++;
}
}
return result;
}
//leave the rule unremoved!!! as I have no way to do it!!!
void Grammar::removeToken(int tIndex)
{
delete[]token[tIndex]->name;
delete token[tIndex];
tokenCount--;
for (int i=tIndex; i<tokenCount; i++)
{
token[i]=token[i+1];
}
}
bool Grammar::isRedundant(int tIndex)
{
int k, theRule, theToken;
for (int i=0; i<tokenCount; i++)
{
for (int j=0; j<token[i]->count; j++)
{
k=0;
theRule=token[i]->production[j];
theToken=production[theRule][k];
while (theToken!=-1)
{
if (theToken==tIndex)
{
return false;
}
k++;
theToken=production[theRule][k];
}
}
}
return true;
}
//what is redundant? except the start variable
//the non-terminal never appears in other rules!
void Grammar::removeRedundant()
{
int tIndex=1;
bool findNew=false;
while (tIndex<tokenCount)
{
findNew=false;
if (!token[tIndex]->terminal)
{
if (isRedundant(tIndex))
{
removeToken(tIndex);
findNew=true;
}
}
if (findNew)
{
tIndex=1;
}
else
{
tIndex++;
}
}
}
void Grammar::printAllRules()
{
/*
cout<<"\nnow print all rules\n";
for (int i=0; i<=prodIndex; i++)
{
cout<<"rule index "<<i<<": ";
printRule(i);
cout<<"\n";
}
*/
int sum=0;
for (int i=0; i<tokenCount; i++)
{
if (!token[i]->terminal)
{
sum+=token[i]->count;
}
}
cout<<" total rules is:"<<prodIndex+1;
cout<<"\n and the sum is "<<sum<<endl;
}
void Grammar::addBeginEnd()
{
int begin=addToken(startStr, false);
int end=addToken(endStr, true);
prodIndex++;
production[prodIndex][0]=0;
production[prodIndex][1]=end;
production[prodIndex][2]=-1;
addRuleForToken(begin, prodIndex);
startSymbolIndex=begin;
stackBottomIndex=end;
}
/*
void Grammar::calculateProperty()
{
calculateNull();
calculateFirst();
calculateFollow();
}
*/
void Grammar::printToken(bool onlyVar)
{
for (int i=0; i<tokenCount; i++)
{
if (onlyVar)
{
if (token[i]->terminal)
{
continue;
}
}
cout<<"name: "<<token[i]->name<<endl;
cout<<" isNull: "<<(token[i]->isNull?"true":"false")<<endl;
cout<<" isTerminal: "<<(token[i]->terminal?"true":"false")<<endl;
cout<<" first: ";
for (int j=0; j<token[i]->firstCount; j++)
{
if (j!=0)
{
cout<<",";
}
cout<<"["<<j<<"]="<<token[token[i]->first[j]]->name;
}
cout<<"\n follow: ";
for (j=0; j<token[i]->followCount; j++)
{
if (j!=0)
{
cout<<",";
}
cout<<"["<<j<<"]="<<token[token[i]->follow[j]]->name;
}
cout<<endl;
}
}
//must use while loop to discover new set!!!!!
void Grammar::calculateFirst()
{
int i;
bool addNew;
do
{
i=0;
addNew=false;
while (i<tokenCount)
{
//the terminal contains itself
if (token[i]->terminal)
{
//addFirst don't judge if it is nullable!!
addFirst(i, i);
//token[i]->first[token[i]->firstCount++]=i;
}
else
{
//for all its rules
for (int j=0; j<token[i]->count; j++)
{
int theToken, k=0;
theToken=production[token[i]->production[j]][k];
//for each token in each rule
do
{
//add non epsilon set
if (addIntoFirst(i, theToken))
{
addNew=true;
}
//if it is not null, means it is end
if (!token[theToken]->isNull)
{
break;
}
//if it is nullable, continue
k++;
theToken=production[token[i]->production[j]][k];
}while (theToken!=-1);
//it means all token in this rule is nullable, so
if (theToken==-1)
{
//it is nullable
//addEpsilonIntoFirst(i);
addFirst(i, emptyIndex);
}
}
}
i++;
}
}while (addNew);
}
bool Grammar::addFollowIntoFollow(int target, int source)
{
bool addNew=false;
for (int i=0; i<token[source]->followCount; i++)
{
if (addFollow(target, token[source]->follow[i]))
{
addNew=true;
}
}
return addNew;
}
bool Grammar::addIntoFollow(int target, int source)
{
bool addNew=false;
if (source==-1)
{
return false;
}
for (int i=0; i<token[source]->firstCount; i++)
{
//add non-epsilon
if (!token[token[source]->first[i]]->isNull)
{
if (addFollow(target, token[source]->first[i]))
{
addNew=true;
}
}
}
return addNew;
}
void Grammar::calculateFollow()
{
int i;
bool addNew, started;
//token[startSymbolIndex]->follow[0]=stackBottomIndex;
//token[startSymbolIndex]->followCount=1;
do
{
i=0;
addNew=false;
while (i<tokenCount)
{
//the terminal contains itself
if (!token[i]->terminal)
{
for (int tIndex=0; tIndex<tokenCount; tIndex++)
{
//for all its rules
if (token[tIndex]->terminal)
{
continue;
}
//for each its rule
for (int j=0; j<token[tIndex]->count; j++)
{
int theToken, k=0, theRule=token[tIndex]->production[j];
started=false;
theToken=production[theRule][k];
//for each token in each rule
do
{
//the token appears here
if (theToken==i)
{
started=true;
//add non epsilon set, including -1 situation!!!
if (addIntoFollow(i, production[theRule][k+1]))
{
addNew=true;
}
}
//if it is not null, means it is end
if (started&&!token[theToken]->isNull)
{
break;
}
//if it is nullable, continue
k++;
theToken=production[theRule][k];
}while (theToken!=-1);
//it means all token in this rule is nullable, so
if (started&&theToken==-1)
{
//it is nullable
//add current variable Follow(j) into Follow(i);
if (addFollowIntoFollow(i, tIndex))
{
addNew=true;
}
}
}
}
}
i++;
}
}while (addNew);
}
void Grammar::addRuleForToken(int tIndex, int ruleIndex)
{
token[tIndex]->production[token[tIndex]->count++]=ruleIndex;
}
void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset)
{
int end=0;
while (production[ruleIndex][end]!=-1)
{
end++;
}
if (left2Right)
{
for (int i=end; i>=0; i--)
{
production[ruleIndex][i+offset]=production[ruleIndex][i];
}
}
else
{
for (int i=0; i<=end-offset; i++)
{
production[ruleIndex][i]=production[ruleIndex][i+offset];
}
checkEpsilon(ruleIndex);
}
}
void Grammar::checkEpsilon(int ruleIndex)
{
if (production[ruleIndex][0]==-1)
{
epsilonRule(ruleIndex);
}
}
bool Grammar::addFollow(int target, int source)
{
//check if it is already there
for (int i=0; i<token[target]->followCount; i++)
{
if (token[target]->follow[i]==source)
{
return false;
}
}
//add at end
token[target]->follow[token[target]->followCount++]=source;
return true;
}
bool Grammar::addFirst(int target, int source)
{
//check if it is already there
for (int i=0; i<token[target]->firstCount; i++)
{
if (token[target]->first[i]==source)
{
return false;
}
}
//add at end
token[target]->first[token[target]->firstCount++]=source;
return true;
}
//add non epsilon into it.
bool Grammar::addIntoFirst(int target, int source)
{
bool addNew=false;
if (token[source]->terminal)
{
return addFirst(target, source);
}
for (int i=0; i<token[source]->firstCount; i++)
{
//add non epsilon
if (!token[token[source]->first[i]]->isNull)
{
if (addFirst(target, token[source]->first[i]))
{
addNew=true;
}
}
}
return addNew;
}
bool Grammar::Null(int tIndex)
{
if (token[tIndex]->terminal)
{
return token[tIndex]->isNull;
}
for (int i=0; i<token[tIndex]->count; i++)
{
int j=0, theToken;
theToken=production[token[tIndex]->production[i]][j];
do
{
if (theToken==tIndex||!Null(theToken))
{
break;
}
j++;
theToken=production[token[tIndex]->production[i]][j];
}while (theToken!=-1);
if (theToken==-1)
{
return true;
}
}
return false;
}
void Grammar::calculateNull()
{
for (int i=0; i<tokenCount; i++)
{
token[i]->isNull=Null(i);
}
}
int Grammar::findMetaSymbol(int& begin, int& end)
{
int theRule, theToken, k;
begin=end=-1;
for (int i=0; i<tokenCount; i++)
{
for (int j=0; j<token[i]->count; j++)
{
k=0;
theRule=token[i]->production[j];
theToken=production[theRule][k];
while (theToken!=-1)
{
if (theToken==beginIndex)
{
begin=k;
}
if (theToken==endIndex)
{
end=k;
return theRule;
}
k++;
theToken=production[theRule][k];
}
}
}
return -1;
}
void Grammar::doReplaceMetaSymbol(int ruleIndex, int begin, int end)
{
int newTokenIndex=forgeToken(0), i=0;
addRuleForToken(newTokenIndex, ++prodIndex);
//token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
//token[newTokenIndex]->terminal=false;
copyRule(ruleIndex, prodIndex);
//shrink
while (production[ruleIndex][i+end+1]!=-1)
{
production[ruleIndex][begin+i]=production[ruleIndex][i+end+1];
i++;
}
production[ruleIndex][begin+i]=-1;
addAtEnd(ruleIndex, newTokenIndex);
/*
production[ruleIndex][begin]=newTokenIndex;
production[ruleIndex][begin+1]=-1;
*/
shiftRule(prodIndex, false, begin+1);
production[prodIndex][end-begin-1]=newTokenIndex;
production[prodIndex][end-begin-1]=-1;
epsilonRule(++prodIndex);
addRuleForToken(newTokenIndex, prodIndex);
}
void Grammar::replaceMetaSymbol()
{
int begin, end, ruleIndex;
while ((ruleIndex=findMetaSymbol(begin, end))!=-1)
{
doReplaceMetaSymbol(ruleIndex, begin, end);
}
/*
for (int i=0; i<tokenCount; i++)
{
//if (token[i]->isMeta)
if (i==beginIndex||i==endIndex)
{
removeToken(i);
}
}
*/
}
void Grammar::addAtEnd(int ruleIndex, int toAdd)
{
int end=0;
while (production[ruleIndex][end]!=-1)
{
end++;
}
production[ruleIndex][end++]=toAdd;
production[ruleIndex][end]=-1;
}
//left-recursion: A -> A a | b | c
//change to: A -> b A' | c A'
// A' -> a A' | empty
void Grammar::removeImmediate(int tIndex, int ruleIndex)
{
int newIndex=forgeToken(tIndex);
int holdRuleIndex=token[tIndex]->production[ruleIndex];
//sequence matters!
//change to: A -> b A'
for (int i=0; i<token[tIndex]->count; i++)
{
if (i!=ruleIndex)
{
addAtEnd(token[tIndex]->production[i], newIndex);
}
}
//shift
removeRuleFromToken(tIndex, ruleIndex);
addRuleForToken(newIndex, holdRuleIndex);
//token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex;
shiftRule(holdRuleIndex, false, 1);
addAtEnd(holdRuleIndex, newIndex);
//add epsilon rule for new variable
epsilonRule(++prodIndex);
addRuleForToken(newIndex, prodIndex);
}
int Grammar::forgeToken(int index)
{
char str[MaxTokenLength+2], ch;
int len=strlen(token[index]->name);
int temp=0, i=0;
strcpy(str, token[index]->name);
ch=str[len-1];//get last char of name
if (ch>='0'&&ch<'9')
{
str[len-1]=ch+i+1;
}
else
{
str[len]='0'+i;
str[len+1]='\0';
}
//I won't bother to check repetitation of name
while (tokenIndex(str)!=-1)
{
i++;
if (ch>='0'&&ch<'9')
{
str[len-1]=ch+i+1;
}
else
{
str[len]='0'+i;
str[len+1]='\0';
}
}
return addToken(str, false);//it is non-terminal for sure
}
int Grammar::copyRule(int source, int target)
{
int i=0;
while (production[source][i]!=-1)
{
production[target][i]=production[source][i];
i++;
}
production[target][i]=-1;
return i;
}
void Grammar::doAddRule(int tIndex)
{
token[tIndex]->production[token[tIndex]->count++]=++prodIndex;
}
void Grammar::addRule(const char* str)
{
int index;
index=addToken(str);
production[prodIndex][curPos++]=index;
production[prodIndex][curPos]=-1;//set end
}
//if the token is already there, it return the index
//otherwise, it add new node in token array
int Grammar::addToken(const char* str, bool isTerminal)
{
int index;
index=tokenIndex(str);
if (index==-1)
{
index=tokenCount;
}
else
{
return index;
}
token[index]=createToken(str, isTerminal);
if (strcmp(str, optionBegin)==0)
{
beginIndex=index;
}
if (strcmp(str, optionEnd)==0)
{
endIndex=index;
}
tokenCount++;
if (strcmp(str, emptyStr)==0)
{
token[index]->isNull=true;
emptyIndex=index;
}
return index;
}
void Grammar::newRule(const char* str)
{
if (str!=NULL)
{
curIndex=addToken(str, false);
}
//add one pointer
token[curIndex]->production[token[curIndex]->count++]=++prodIndex;
token[curIndex]->terminal=false;
curPos=0;//reset to restart;
}
Token* Grammar::createToken(const char* theName, bool isTerminal)
{
Token* ptr=new Token;
ptr->name=new char[strlen(theName)+1];
strcpy(ptr->name, theName);
ptr->terminal=isTerminal;
ptr->count=ptr->firstCount=ptr->followCount=0;
ptr->isNull=false;
return ptr;
}
int Grammar::tokenIndex(const char* str)
{
for (int i=0; i<tokenCount; i++)
{
if (strcmp(str, token[i]->name)==0)
{
return i;
}
}
return -1;
}
int Grammar::checkRecursion(int tIndex, int curToken)
{
for (int i=0; i<token[curToken]->count; i++)
{
//token[tIndex]->production[i]=ruleIndex
//production[ruleIndex][0] is the first left-recursion one
if (production[token[curToken]->production[i]][0]<=curToken)
{
return i;
}
}
return -1;
}
void Grammar::printRule(int index)
{
int nodeIndex=0;
cout<<" ~"<<index<<"~ ";
while (production[index][nodeIndex]!=-1)
{
cout<<token[production[index][nodeIndex]]->name<<" ";
nodeIndex++;
}
}
void Grammar::initialize()
{
tokenCount=curIndex=curPos=0;
prodIndex=-1;//in order to ++ blindly
}
Grammar::Grammar()
{
initialize();
}
void Grammar::removeLeftRecursion()
{
int tIndex=0, curToken=0;
bool newChange=false;
while (tIndex<tokenCount)
{
if (!token[tIndex]->terminal)
{
for (int i=0; i<token[tIndex]->count; i++)
{
curToken=production[token[tIndex]->production[i]][0];
if (curToken<=tIndex&&!token[curToken]->terminal)
{
if (curToken!=tIndex)
{
replaceRule(tIndex, i);
}
else
{
removeImmediate(tIndex, i);
}
newChange=true;
}
}
}
//whenever there is some new findings, restart
if (!newChange)
{
tIndex++;
}
else
{
tIndex=0;
newChange=false;
}
}
}
void Grammar::replaceRule(int tIndex, int ruleIndex)
{
int pos, j, targetEnd, sourceEnd, curRule;
curRule=token[tIndex]->production[ruleIndex];
int curToken=production[curRule][0];
for (int i=token[curToken]->count-1; i>=0; i--)
{
if (i>0)
{
doAddRule(tIndex);
pos=copyRule(token[curToken]->production[i], prodIndex);
j=0;
while (production[curRule][j+1]!=-1)
{
production[prodIndex][pos+j]=production[curRule][j+1];
j++;
}
production[prodIndex][pos+j]=-1;
//addRuleForToken(curToken, prodIndex);
}
else
{
targetEnd=sourceEnd=0;
//curRule=token[tIndex]->production[ruleIndex];
while (true)
{
if (production[token[curToken]->production[0]][sourceEnd]==-1&&
production[curRule][targetEnd]==-1)
{
break;
}
if (production[token[curToken]->production[0]][sourceEnd]!=-1)
{
sourceEnd++;
}
if (production[curRule][targetEnd]!=-1)
{
targetEnd++;
}
}
j=targetEnd+sourceEnd-1;
while (j>=0)
{
if (j>=sourceEnd)
{
production[curRule][j]=production[curRule][j-sourceEnd+1];
}
else
{
production[curRule][j]=production[token[curToken]->production[0]][j];
}
j--;
}
}
}
}
//optimize sequence is first common-factor then remove left recursion
//therefore I don't have to check if for same variable if there will be
//more than one left-recursion
void Grammar::optimize()
{
replaceMetaSymbol();
removeLeftRecursion();
commonLeftFactor();
//removeRedundant();
addBeginEnd();
calculateNull();
calculateFirst();
calculateFollow();
}
int Grammar::findCommonFactor(int tIndex, int ruleIndex)
{
for (int i=ruleIndex+1; i<token[tIndex]->count; i++)
{
//if the two rule has the same first token
if (production[token[tIndex]->production[ruleIndex]][0]==
production[token[tIndex]->production[i]][0])
{
/*
//calculate if there is epsilon
if (emptyIndex==-1)
{
emptyIndex=tokenIndex(emptyStr);
}
//if it is not epsilon
if (production[token[tIndex]->production[i]][0]!=emptyIndex)
{
return i;
}
*/
return i;
}
}
return -1;
}
void Grammar::epsilonRule(int ruleIndex)
{
production[ruleIndex][0]=addToken(emptyStr);
production[ruleIndex][1]=-1;
}
//sample: x -> Aa
// x -> Ab
//changed to: x -> Ax' //this is to change the old rule
// x' -> b //this is to change the old rule
// x' -> a //this is the new-added rule
void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index)
{
int newTokenIndex=forgeToken(tIndex);//create a new variable
//move the second and after part to the new rule of new variable
//doing: x' -> a
//curPos=0;
//prepare to add one new rule
addRuleForToken(newTokenIndex, ++prodIndex);
//token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex;
token[newTokenIndex]->terminal=false;
copyRule(token[tIndex]->production[ruleIndex], prodIndex);
shiftRule(prodIndex, false, 1);
/*
do
{
//do copying
production[prodIndex][curPos]=
production[token[tIndex]->production[ruleIndex]][curPos+1];
curPos++;
//even the -1 at end is copied
}while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1);
*/
//in order to show an empty string, in case the string is "epsilon"
/*
if (curPos==1)
{
epsilonRule(prodIndex);
}
*/
//replace x -> Aa with x -> Ax'
production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex;
production[token[tIndex]->production[ruleIndex]][2]=-1;//end
//doing: x' -> b
//curPos=0;
//prepare to add one new rule
//pointing new token to where old rule lies
addRuleForToken(newTokenIndex, token[tIndex]->production[index]);
/*
token[newTokenIndex]->production[token[newTokenIndex]->count++]=
token[tIndex]->production[index];
*/
shiftRule(token[tIndex]->production[index], false, 1);
removeRuleFromToken(tIndex, index);
}
void Grammar::removeRuleFromToken(int tIndex, int ruleIndex)
{
token[tIndex]->count--;
for (int i=ruleIndex; i<token[tIndex]->count; i++)
{
token[tIndex]->production[i]=token[tIndex]->production[i+1];
}
}
void Grammar::commonLeftFactor()
{
int index=-1, tIndex=0, ruleIndex=0;
bool flag;
//whenever a newrule is done, restart!
while (tIndex<tokenCount)
{
ruleIndex=0;
flag=false;
while (ruleIndex<token[tIndex]->count)
{
index=findCommonFactor(tIndex, ruleIndex);
if (index!=-1)
{
doCommonFactor(tIndex, ruleIndex, index);
//restart
flag=true;
break;
}
else
{
ruleIndex++;
}
}
if (flag)
{
tIndex=0;
}
else
{
tIndex++;
}
}
}
Token* Grammar::operator[](int index)
{
if (index>=0&&index<tokenCount)
{
return token[index];
}
else
{
return NULL;
}
}
void Grammar::print()
{
for (int i=0; i<tokenCount; i++)
{
if (!token[i]->terminal)
{
cout<<token[i]->name<<" ==> ";
for (int j=0; j<token[i]->count; j++)
{
printRule(token[i]->production[j]);
if (j!=token[i]->count-1)
{
cout<<" | ";
}
}
cout<<"\n";
}
}
}
file name: main.cpp (main)
#include <iostream>
#include "CFGReader.h"
using namespace std;
int main()
{
CFGReader R;
R.readFromFile("c:\\ruleTest.txt");
R.optimize();
R.print();
return 0;
}
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->" and "{", "}" is meta symbol
equivalent to Kaleen star.
M -> program i ; Dl B
Dl -> Dv Ml
Ml -> Ml Mo | e
Mo -> module i ( Vl ) Dv B
Dv -> variables Vl | e
Vl -> Vl V | V
V -> Il : T ;
T -> integer Ad ; | char Ad
Il -> i | Il , i
L -> i Ar
Ad -> e | array [ n ]
B -> begin Sl end ;
Sl -> S | S Sl
S -> L := E ; | if C then S else S | loop Sl end ; | exit ; | i ( Lp ) ; | B | read Ln ; | write Lo ; | e ;
Lp -> e | Ln
Ln -> L { , L }
Lo -> Lr { , Lr }
Lr -> i Ar | n | c
Ar -> e | [ E ]
E -> F { Oa F }
Oa -> + | -
F -> R { Om R }
Om -> * | /
R -> L | n | ( E ) | c
C -> E Or E
Or -> = | < | > | <= | >= | !=
Here is the result:
M ==> ~0~ program i ; Dl B Dl ==> ~1~ Dv Ml B ==> ~17~ begin Sl end ; Dv ==> ~5~ variables Vl | ~6~ e Ml ==> ~3~ e Ml0 Mo ==> ~4~ module i ( Vl ) Dv B Vl ==> ~8~ V Vl0 V ==> ~9~ Il : T ; Il ==> ~12~ i Il0 T ==> ~10~ integer Ad ; | ~11~ char Ad Ad ==> ~15~ e | ~16~ array [ n ] L ==> ~14~ i Ar Ar ==> ~36~ e | ~37~ [ E ] Sl ==> ~18~ S Sl0 S ==> ~20~ i S0 | ~21~ if C then S else S | ~22~ loop Sl end ; | ~23~ exit ; | ~25~ begin S l end ; | ~26~ read Ln ; | ~27~ write Lo ; | ~28~ e ; E ==> ~38~ F M0 C ==> ~48~ F M0 Or E Lp ==> ~29~ e | ~30~ Ln Ln ==> ~31~ i Ar M1 Lo ==> ~32~ Lr M2 Lr ==> ~33~ i Ar | ~34~ n | ~35~ c F ==> ~41~ R M3 Oa ==> ~39~ + | ~40~ - R ==> ~44~ i Ar | ~45~ n | ~46~ ( E ) | ~47~ c Om ==> ~42~ * | ~43~ / Or ==> ~49~ = | ~50~ < | ~51~ > | ~52~ <= | ~53~ >= | ~54~ != M0 ==> ~55~ + F | ~56~ e | ~66~ - F M1 ==> ~57~ , L | ~58~ e M2 ==> ~59~ , Lr | ~60~ e M3 ==> ~61~ * R | ~62~ e | ~67~ / R Ml0 ==> ~2~ module i ( Vl ) Dv B Ml0 | ~63~ e Vl0 ==> ~7~ i Il0 : T ; Vl0 | ~64~ e Il0 ==> ~13~ , i Il0 | ~65~ e Sl0 ==> ~68~ e | ~19~ Sl S0 ==> ~69~ Ar := E ; | ~24~ ( Lp ) ; START ==> ~70~ M $ name: M isNull: false isTerminal: false first: [0]=program follow: [0]=$ name: Dl isNull: true isTerminal: false first: [0]=e,[1]=variables,[2]=module follow: [0]=begin name: B isNull: false isTerminal: false first: [0]=begin follow: [0]=module name: Dv isNull: true isTerminal: false first: [0]=variables,[1]=e follow: [0]=module,[1]=begin name: Ml isNull: true isTerminal: false first: [0]=e,[1]=module follow: [0]=begin name: Mo isNull: false isTerminal: false first: [0]=module follow: name: Vl isNull: false isTerminal: false first: [0]=i follow: [0]=) name: V isNull: false isTerminal: false first: [0]=i follow: [0]=i name: Il isNull: false isTerminal: false first: [0]=i follow: [0]=: name: T isNull: false isTerminal: false first: [0]=integer,[1]=char follow: [0]=; name: Ad isNull: true isTerminal: false first: [0]=e,[1]=array follow: [0]=; name: L isNull: false isTerminal: false first: [0]=i follow: name: Ar isNull: true isTerminal: false first: [0]=e,[1]=[ follow: [0]=,,[1]=:=,[2]=;,[3]=*,[4]=/ name: Sl isNull: false isTerminal: false first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=; follow: [0]=end name: S isNull: false isTerminal: false first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=e,[8]=; follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=else name: E isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=],[1]=),[2]=; name: C isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=then name: Lp isNull: true isTerminal: false first: [0]=e,[1]=i follow: [0]=) name: Ln isNull: false isTerminal: false first: [0]=i follow: [0]=; name: Lo isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=c follow: [0]=; name: Lr isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=c follow: [0]=, name: F isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=+,[1]=- name: Oa isNull: false isTerminal: false first: [0]=+,[1]=- follow: name: R isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=*,[1]=/ name: Om isNull: false isTerminal: false first: [0]=*,[1]=/ follow: name: Or isNull: false isTerminal: false first: [0]==,[1]=<,[2]=>,[3]=<=,[4]=>=,[5]=!= follow: [0]=i,[1]=n,[2]=(,[3]=c name: M0 isNull: true isTerminal: false first: [0]=+,[1]=e,[2]=- follow: [0]=],[1]=),[2]=;,[3]==,[4]=<,[5]=>,[6]=<=,[7]=>=,[8]=!= name: M1 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=; name: M2 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=; name: M3 isNull: true isTerminal: false first: [0]=*,[1]=e,[2]=/ follow: [0]=+,[1]=- name: Ml0 isNull: true isTerminal: false first: [0]=module,[1]=e follow: [0]=begin name: Vl0 isNull: true isTerminal: false first: [0]=i,[1]=e follow: [0]=) name: Il0 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=: name: Sl0 isNull: true isTerminal: false first: [0]=e,[1]=i,[2]=if,[3]=loop,[4]=exit,[5]=begin,[6]=read,[7]=write,[8]=; follow: [0]=end name: S0 isNull: false isTerminal: false first: [0]=[,[1]=:=,[2]=( follow: name: START isNull: false isTerminal: false first: [0]=program follow: M:program=0, Dl:e=1,module=1,variables=1,begin=1, B:begin=17, Dv:e=6,module=6,variables=5,begin=6, Ml:e=3,module=3,begin=3, Mo:module=4, Vl:i=8, V:i=9, Il:i=12, T:integer=10,char=11, Ad:;=15,e=15,array=16, L:i=14, Ar:;=36,e=36,,=36,[=37,:==36,*=36,/=36, Sl:i=18,;=18,e=18,begin=18,if=18,loop=18,exit=18,read=18,write=18, S:i=20,;=28,e=28,begin=25,if=21,loop=22,exit=23,read=26,write=27, E:i=38,(=38,n=38,c=38, C:i=48,(=48,n=48,c=48, Lp:i=30,e=29,)=29, Ln:i=31, Lo:i=32,n=32,c=32, Lr:i=33,n=34,c=35, F:i=41,(=41,n=41,c=41, Oa:+=39,-=40, R:i=44,(=46,n=45,c=47, Om:*=42,/=43, Or:==49,<=50,>=51,<==52,>==53,!==54, M0:;=56,e=56,)=56,]=56,+=55,-=66,==56,<=56,>=56,<==56,>==56,!==56, M1:;=58,e=58,,=57, M2:;=60,e=60,,=59, M3:e=62,+=62,-=62,*=61,/=67, Ml0:e=63,module=2,begin=63, Vl0:i=7,e=64,)=64, Il0:e=65,:=65,,=13, Sl0:i=19,;=19,e=68,begin=19,end=68,if=19,loop=19,exit=19,read=19,write=19, S0:e=69,(=24,[=69,:==69, START:program=70, Press any key to continue