huffman算法---文件压缩
huffman算法---文件压缩2016-10-30
#pragma once #include<cassert> #include<vector> using namespace std; template<typename T> struct Less { bool operator()(const T& l, const T& r) { return l < r; } }; template<typename T> struct Great { bool operator()(const T& l, const T& r) { return l>r; } }; //通过仿函数,可以建最小堆也可以建最大堆 template<typename T, class Compare = Less<T>> class Heap { public: Heap() {} Heap(T *a, int size) { _a.reserve(size); for (int i = 0; i < size; i++) { _a.push_back(a[i]); } //建堆 for (int i = (size - 2) / 2; i >= 0; --i) //从最后一个非叶结点开始调整 { AdjustDown(i, size); } } void Push(const T& x) { //插入到尾部,再从最后一个元素开始向上调整 _a.push_back(x); AdjustUp(_a.size() - 1); } void Pop() { //将堆顶与最后一个元素交换,再从堆顶下滑调整 assert(!_a.empty()); swap(_a[0], _a[_a.size() - 1]); _a.pop_back(); if (_a.size()>1) //如果堆中的元素大于一个,再进行调整 { AdjustDown(0, _a.size()); } } size_t Size() { return _a.size(); } bool Empty() { return _a.empty(); } const T& Top() { assert(!_a.empty()); return _a[0]; } protected: void AdjustDown(int root, int size) //向下调整算法 { assert(!_a.empty()); int parent = root; int child = parent * 2 + 1; while (child<size) { if ((child + 1) < size &&Compare()(_a[child + 1], _a[child])) //找左右孩子中最大的下标 child++; if (Compare()(_a[child], _a[parent])) { swap(_a[parent], _a[child]); parent = child; child = parent * 2 + 1; } else { break; } } } void AdjustUp(int child) //向上调整算法 { assert(!_a.empty()); while (child>0) { int parent = (child - 1) / 2; if (Compare()(_a[child], _a[parent])) { swap(_a[parent], _a[child]); child = parent; } else { break; } } } private: vector<T> _a; };
#pragma once #include"heap.h" template<typename T> struct HuffmanNode { T _data; HuffmanNode<T>* _left; HuffmanNode<T>* _right; HuffmanNode<T>* _parent; HuffmanNode(const T& data = T()) :_data(data) , _left(NULL) , _right(NULL) , _parent(NULL) {} }; template<typename T> class HuffmanTree { typedef HuffmanNode<T> Node; public: HuffmanTree() :_root(NULL) {} HuffmanTree(T *a, size_t size,const T& invalid=T()) { //最小堆的比较方式 struct NodeLess { bool operator()(Node* l, Node* r) { assert(l); assert(r); return l->_data < r->_data; } }; //将这组数据建成一个最小堆,堆中元素的类型是Node*,这是为了保存后面结点的父节点 Heap<Node*, NodeLess> minHeap; for (size_t i = 0; i <size; i++) { if (a[i]._count!=invalid._count) //如果字符出现的次数不为0,就加入堆中 { Node* _node = new Node(a[i]); minHeap.Push(_node); } } //用huffman算法,从堆里面取出最小的两个结点并删除,将这两个结点构成一棵树在插入堆中 Node* frist = NULL; Node* second = NULL; Node* parent = NULL; while (minHeap.Size()>1) { frist = minHeap.Top(); minHeap.Pop(); second = minHeap.Top(); minHeap.Pop(); parent = new Node(frist->_data + second->_data); parent->_left = frist; parent->_right = second; frist->_parent = parent; second->_parent = parent; minHeap.Push(parent); } //堆里面的最后一个就是Huffman树的根节点 _root = minHeap.Top(); } Node* GetRoot() { return _root; } ~HuffmanTree() { if (_root != NULL) { Destory(_root); } } protected: void Destory(Node * root) { if (root == NULL) return; Node* cur = root; Destory(cur->_left); Destory(cur->_right); delete cur; cur = NULL; } private: Node* _root; };
#pragma once #include<string> #include<cstdio> #include<cassert> #include<cstdlib> #include<algorithm> using namespace std; #include"HuffmanTree.h" typedef long long LongType; struct CharInfo { unsigned char _data; LongType _count; //保存字符出现的次数 string _Code; //保存Huffman编码 CharInfo(LongType count=0) :_count(count) {} CharInfo operator+(CharInfo& ch) { return CharInfo(_count+ch._count); } bool operator<(CharInfo &ch) { return _count < ch._count; } }; class HuffFileCompress { public: HuffFileCompress() { for (int i = 0; i < 256; i++) //将Infos中每个data初始化相应的字符 { _Infos[i]._data = i; } } HuffFileCompress(const char * filename) { assert(filename != NULL); for (int i = 0; i < 256; i++) //将Infos中每个data初始化相应的字符 { _Infos[i]._data = i; } FILE *fInput = fopen(filename, "rb"); //打开文件 assert(fInput); int ch = 0; while ((ch = fgetc(fInput)) != EOF) //读文件,统计字符出现次数 { _Infos[ch]._count++; } //构建huffman树 CharInfo invalid; HuffmanTree<CharInfo> ht(_Infos, 256, invalid); //得到huffman编码 string str; GetHufCode(ht.GetRoot(), str); fclose(fInput); } //文件压缩 const string CompressFile(const string filename) { assert(!filename.empty()); //得到压缩文件的文件名 string CompressFileName = filename; CompressFileName += ".huf"; FILE *fInput = fopen(filename.c_str(),"rb"); //打开原文件 assert(fInput); FILE *fOut = fopen(CompressFileName.c_str(),"wb"); //打开压缩文件 if (fOut == NULL) { fclose(fOut); exit(EXIT_FAILURE); } //编写配置文件,保存字符以及字符出现的次数,用于解压时重建huffmanTree string configFileName = filename; //配置文件名 configFileName += ".config"; FILE *configOut = fopen(configFileName.c_str(),"wb"); //打开配置文件 assert(configOut); string line; for (int i = 0; i < 256; i++) { if (_Infos[i]._count!=0) //将字符以及出现的次数存在一行 { int c=_Infos[i]._data; fputc(c,configOut); line += ","; char buffer[25] = { 0 }; //将次数转换成字符串存储 line+=_itoa((int)_Infos[i]._count, buffer, 10); line += '\n'; fputs(line.c_str(),configOut); line.clear(); } } fclose(configOut); //关闭配置文件 int c=0; //用来保存huffman编码所构成的字符 int pos =7; //判断处理的位数,如果到8则进行写入 int ch = fgetc(fInput); while (ch!=EOF) { string &code=_Infos[ch]._Code; for (size_t i = 0; i < code.size(); i++) //处理ch这个字符的编码 { c |= ((code[i]-'0') << pos); //从高位开始相或 pos--; if (pos<0) { fputc(c,fOut); pos = 7; c = 0; } } ch = fgetc(fInput); } if (pos<7) //处理最后一个字符编码不是8位的情况 { fputc(c, fOut); } fclose(fOut); fclose(fInput); return CompressFileName; } //解压缩 const string UnCompressFile(string filename) { assert(!filename.empty()); //得到解压缩之后的文件的名字 string name; name= filename; int i = 0; string posfix; for (i =(int)filename.size()-1; i>=0; --i) //找到后缀出现的位置 { posfix.push_back(filename[i]); if (filename[i] == '.') break; } reverse(posfix.begin(),posfix.end()); //让posfix保存要解压文件的后缀 if (posfix != ".huf") //如果要解压的文件不是huffman压缩的,则不能解压 { return NULL; } //去掉后缀 name.resize(i); string UnCompressFileName = name; //得到压缩文件名 UnCompressFileName += ".uhuf"; string configName = name; configName+=".config"; //得到配置文件名 //打开压缩文件 FILE *fInput = fopen(filename.c_str(),"rb"); FILE *fInput2 =fInput; assert(fInput); //打开解压缩文件 FILE *fOut = fopen(UnCompressFileName.c_str(),"wb"); if (fOut == NULL) { fclose(fInput); exit(EXIT_FAILURE); } int ch = 0; //读取配置文件 FILE *configInput = fopen(configName.c_str(),"rb"); //打开配置文件名 string line; int c = 0; while ((c = fgetc(configInput))!=EOF) //读取一行 { GetLine(configInput, line); _Infos[c]._count = atoi((&line[1])); //获得字符出现的次数 line.clear(); } fclose(configInput); //关闭配置文件 //构建huffman树 CharInfo invalid; HuffmanTree<CharInfo> ht(_Infos, 256, invalid); LongType count = ht.GetRoot()->_data._count; //获取字符的总个数 int pos=7; c = 1; HuffmanNode<CharInfo> *root= ht.GetRoot(); //得到huffman树的根节点 HuffmanNode<CharInfo> *cur = root; while (count > 0) //以字符的总个数作为结束标志 { ch = fgetc(fInput); //处理ch的二进制 while (pos >= 0 && count > 0) { //寻找叶子结点(编码所对应的字符) if (ch&(c << pos)) { cur = cur->_right; } else { cur = cur->_left; } if (cur->_left == NULL&&cur->_right == NULL) //找到huffman所对应的字符 { //写文件 //if (cur->_data._data == '\n') // fputc(13, fOut); fputc(cur->_data._data, fOut); cur = root; count--; } pos--; } pos = 7; } fclose(fInput); fclose(fOut); return UnCompressFileName; } protected: //得到huffman编码 void GetHufCode(HuffmanNode<CharInfo>* root, string str) { if (root == NULL) return; if (root->_left == NULL&&root->_right == NULL) { _Infos[root->_data._data]._Code = str; //将huffman编码保存在infos里面 return; } GetHufCode(root->_left, str + '0'); //向左子树走压0 GetHufCode(root->_right, str + '1'); //向右子树走压1 } bool GetLine(FILE*& Input,string &line) //读取一行字符 { assert(Input); int ch = 0; ch = fgetc(Input); if (ch == EOF) return false; while (ch != EOF&&ch != '\n') { line += ch; ch = fgetc(Input); } return true; } private: CharInfo _Infos[256]; };