Compression-Project/src/encode.cpp

#include "common.h"
#include "huffman.h"
#define pb push_back


struct weightstruct
{
	int word;
	int clause;
	int sentence;
	weightstruct(int a=0,int b=0,int c=0):word(a),clause(b),sentence(c){}
	weightstruct operator + (const weightstruct& w) const
	{
		return weightstruct(word+w.word,clause+w.clause,sentence+w.sentence);
	}
};
map<string,int> wordlist;
set<string> adjlist;
void getlists()
{
  ifstream listfile("fullist");
  for(int i=0;i<500;i++)
  {
    string a;int b;
    listfile>>a>>b;
    wordlist[a]=b;
  }
  ifstream listfileadj("adjlist");
  for(int i=0;i<227;i++)
  {
    string a;
    listfileadj>>a;
    adjlist.insert(a);
  }
}

weightstruct getweight(string s)
{
	if(s[0]=='$')
		return weightstruct(200,50,10);
	if(s=="one"||s=="two"||s=="three"||s=="four"||s=="five"||s=="six"||s=="seven"||s=="eight"||s=="nine")return weightstruct(100,25,10);
	if(s=="example")return weightstruct(-300,-200,-100);


  if(s=="spi")
    return weightstruct(-100,-10,-200);
  if(s=="ipf")
    return weightstruct(-100,30,50);
  if(s=="please") return weightstruct(-300,0,0);
  if(adjlist.find(s)!=adjlist.end())
  {
    return weightstruct(-100,-10,-10);
  }
  if(wordlist.find(s)!=wordlist.end())
  {
    return weightstruct(-wordlist[s]/100000,-10,-5);
  }

  /*
	if(s=="a"||s=="is"||s=="are"||s=="to"||s=="so"||s=="as") return weightstruct(-200,0,0);
	if(s=="the")
		return weightstruct(-200,0,0);
/*
	if(s.length()==1)
		return weightstruct(-300,0,0);
	if(s.length()==2)
		return weightstruct(-50,0,0);
*/
	return weightstruct(-10,0,0);
}
void preprocessword(int stage,string& s,string& next,int pos)
{
  if(s.length()<2)return;
	if(stage==-1)//after deleting low weights
	{
		if(s[0]=='$')
			s=s.substr(1,s.length()-2);
    if(s.length()>5)
    {
      string tmp;
      for(char x:s)
        if(x!='a'&&x!='e'&&x!='i'&&x!='o'&&x!='u')
          tmp+=x;
      s=tmp;
    }
	}
	if(stage==0)
	{
		string tmp;
		for(char c:s)
			tmp+=('A'<=c&&c<='Z')?(c-'A'+'a'):(c);
		s=tmp;
	}
	if(stage==1)
	{
    if(s=="sample"&&next=="input")
      s="spi",next="";
    if(s=="sample"&&next=="output")
      s="spo",next="";
    if(s=="input"&&next=="format")
      s="ipf",next="";
    if(s=="output"&&next=="format")
      s="opf",next="";

    if(s=="input")
      s="ipt";
    if(s=="output")
      s="opt";
		if(s=="and")
			s=",";
		if(s.size()>=2)
			if(s.substr(s.size()-2)=="'s")
				s=s.substr(0,s.size()-2);
	}
	if(stage==2)
	{
		if(s=="farmer"&&next=="john")
		{
			s="fj";
			next="";
		}
	}
}

struct clause
{
	vector<string> words;
	vector<weightstruct> weights;
	string s;
	weightstruct weight;
	void parse()
	{
		string word="";
		bool iseq=false;
		for(int j=0;j<s.length();j++)
		{
			if(s.at(j)=='$')
			{
				if(iseq)
				{
					word+="$";
					words.pb(word);
					word="";
					iseq=false;
				}
				else
				{
					if(word!="") words.pb(word);
					word="$";
					iseq=true;
				}
			}
			else
			{
				if(s.at(j)!=' '&&s.at(j)!='\"'&&s.at(j)!='\"')
					word+=(s.at(j));
				//if(('a'<=s.at(j)&&s.at(j)<='z')||('A'<=s.at(j)&&s.at(j)<='Z')||('0'<=s.at(j)&&s.at(j)<='9')||(iseq&&s.at(j)!=' '))
				//  word+=('A'<=s.at(j)&&s.at(j)<='Z')?(s.at(j)-'A'+'a'):(s.at(j));
				else
				{
					if(!iseq)
					{
						if(word!="")words.pb(word);
						word="";
					}
				}
			}
		}
		if(word!="") words.pb(word);
	}
	void calcweight()
	{
		weights.clear();
		for(auto& w: words)weights.pb(getweight(w));
		for(auto& w: weights) weight=weight+w;
	}
	void preprocess(int stage)
	{
		for(int i=0;i<words.size();i++)
		{
			string e="";
			string* nx=&e;
			if(i+1<words.size())
				nx=&words[i+1];
			preprocessword(stage,words[i],*nx,i);
		}
	}
	string gettext(int minw)
	{
		bool iss=true;
		string ans="";
		for(int i=0;i<words.size();i++)
			if(weights[i].word>=minw)
			{
				if(iss)
				{
					iss=false;
					ans+=words[i];
				}
				else ans+=" "+words[i];
			}
		return ans;
	}
};
struct sentence
{
	vector<clause> clauses;
	string s;
	weightstruct weight;
	void parse()
	{
		string cl="";
		bool iseq=false;
		for(int j=0;j<s.length();j++)
		{
			if(s.at(j)=='$') iseq=!iseq;
			if((s.at(j)!=','&&s.at(j)!='('&&s.at(j)!=')'&&s.at(j)!=';'&&s.at(j)!=':')|| iseq) cl+=s.at(j);
			else
			{
				clause tmp;
				tmp.s=cl;clauses.pb(tmp);cl="";
			}
		}
		clause tmp;
		tmp.s=cl;clauses.pb(tmp);
		for(auto& c:clauses) c.parse();
		int numclause=0;
		for(int i=0;i<clauses.size();i++)
			if(clauses[i].words.size()!=0)
			{
				auto tmp=clauses[i];
				clauses[numclause]=tmp;
				numclause++;
			}
		clauses.resize(numclause);
	}
	void calcweight()
	{
		weight=weightstruct();
		for(auto& c : clauses)
		{
			c.calcweight();
			weight=weight+c.weight;
		}
	}
	void preprocess(int stage)
	{
		for(auto& c:clauses)
			c.preprocess(stage);
	}
	string gettext(int minw)
	{
		bool iss=true;
		string ans="";
		for(auto& c:clauses)
			if(c.weight.clause>=minw)
			{
				string tmp=c.gettext(minw);
				if(tmp.length()>0)
				{
					if(iss)
					{
						iss=false;
						ans=ans+tmp;
					}
					else ans+=","+tmp;
				}

			}
		return ans;
	}
};
struct file
{
	vector<sentence> text;
	void init(string tx)
	{
		string line;
		stringstream stin(tx);
		string sentencefile="";
		bool iseq=false;
		while(getline(stin,line))
		{
			if(line=="INPUT FORMAT")
			{
				sentence tmp;tmp.s=sentencefile;text.pb(tmp);
				sentencefile=line;
				while(getline(stin,line)&&line!="SAMPLE INPUT")
				{
					sentencefile+=' '+line;
				}
				sentence tmp2;tmp2.s=sentencefile;text.pb(tmp2);
				sentencefile="";
			}
			if(line=="SAMPLE INPUT")
			{
				sentence tmp;tmp.s=sentencefile;text.pb(tmp);
				sentencefile=line;
				while(getline(stin,line))
				{
					sentencefile+=' '+line;
				}
				sentence tmp2;tmp2.s=sentencefile;text.pb(tmp2);
				sentencefile="";
			}
			for(int i=0;i<line.length();i++)
			{
				if(line.at(i)=='$') iseq=!iseq;
				if((line.at(i)!='.'&&line.at(i)!='?'&&line.at(i)!='!')|| iseq) sentencefile+=line.at(i);
				else
				{
					sentence tmp;tmp.s=sentencefile;text.pb(tmp);
					sentencefile="";
				}
			}
			sentencefile+=' ';
		}
		sentence tmp;tmp.s=sentencefile;text.pb(tmp);
		for(auto& i:text)i.parse();
	}
	void preprocess(int stage)
	{
		for(auto&x : text)
			x.preprocess(stage);
	}
	void calcweight()
	{
		for(auto& x:text)
			x.calcweight();
	}
	string gettextstring(int minw)
	{
		bool iss=true;
		string ans="";
		for(auto& c:text)
			if(c.weight.sentence>=minw)
			{
				string tmp=c.gettext(minw);
				if(tmp.length()>0)
				{
					if(iss)
					{
						iss=false;
						ans=ans+tmp;
					}
					else ans+="."+tmp;
				}

			}
		return ans;
	}
};

bool invalidChar (char c)
{
    return !(c>=0 && c <128);
}
void stripUnicode(string & str)
{
    str.erase(remove_if(str.begin(),str.end(), invalidChar), str.end());
}

string huffmancompress(string s)
{
  if (s.size() < 2) return "";
  vector<bool> enc = huffman::encode(s);
  ofstream cout("output");
  string ans;
  for (int i = 0; i < (int)enc.size(); i += 8) {
    char out = 0;
    for (int j = i; j < min(i + 8, (int)enc.size()); ++j) {
      if (enc[j]) out += (1 << (j - i));
    }
    ans += out;
  }
  cout << ans;
  return ans;
}

int huffmancompresslength(string s)
{
	if (s.size() < 2) return 0;
  vector<bool> enc = huffman::encode(s);
  return (enc.size() + 7) / 8;
}

int initsize=0;
int compressrate;
int targetsize;

int main(int argc, char *argv[]) {
  getlists();
	string input="";

  string input_file = argv[1];
  //cin >> input_file;

	string line;
	compressrate = atoi(argv[2]);
	ifstream stin(input_file);
  compressrate = 100 - compressrate;

	while(getline(stin,line))
	{
		input+=line;
		input+='\n';
	}
  stripUnicode(input);
	initsize=input.length()-1;//remove last newline

  for(char& x:input)
    if(x=='\r' || x == '\n')
      x=' ';

	file orig;
	orig.init(input);

	file final;
	final=orig;
	for(int i=0;i<5;i++)
	{
		final.preprocess(i);
	}
	targetsize=((compressrate)*initsize+99)/100;

	final.calcweight();
	int a=-10000;int b=10000;//(a,b]
	while(a+1<b)
	{
		int c=(a+b)/2;
		file tmp;

		tmp.init(final.gettextstring(c));
		tmp.calcweight();
		tmp.preprocess(-1);
    string s = tmp.gettextstring(-100000);
		int sz= (s == "" ? 0 : huffmancompresslength(s));
		if(sz>targetsize)
		{
			a=c;
		}
		else
		{
			b=c;
		}
	}
	file output;
	output.init(final.gettextstring(b));
	output.calcweight();
	output.preprocess(-1);
	string s = output.gettextstring(-100000);
  cout << s << '\n';

  //where can i find a library of words in categories can't find any
  // not really sure
  // im not getting anywhere with weighting lol
  // there is no list i can find


  //!!! are we counting input size correctly bc the \r issue?
  s = huffmancompress(s);
  int orig_size = 0;
  ifstream testin(input_file);
  char c;
  while (testin >> c) ++orig_size;
  while (100 * s.size() > compressrate * orig_size) s.pop_back();
  cout << "Original length: " << orig_size << '\n';
  cout << "Compressed length: " << s.size() << '\n';
  cout << "Percent compression: " << 100.0 - ((double)100.0 * s.size() / orig_size) << "%\n";
  cout << s << '\n';

	// WARNING: Huffman will CRASH if you pass a string with only one unique character
	/*string orig = "test1234";
  vector<bool> enc = huffman::encode(orig);
  string dec = huffman::decode(enc);
  if (orig != dec) {
	  cout << "Not a match!\n";
  }
  else {
    cout << dec << '\n';
    for (auto b : enc) cout << b;
    cout << '\n';
	cout << "Original length: " << orig.size() << '\n';
    cout << "Compressed length: " << (enc.size() + 7) / 8 << '\n';
	cout << "Percent compression: " << 100.0 - (double)100.0 * (enc.size() + 7) / 8 / orig.size() << "%\n";
  }*/
}