哈夫曼编码 - Biscuitの赛博小窝

Biscuit

哈基米，你要大步大步地走下去啊，不行，要悠哒悠哒才能欣赏到沿途的风景。

公告

欢迎来到我的博客喵。

了解更多

2579 字

13 分钟

哈夫曼编码

2025-10-17

数据结构与算法

数据结构

/

树

/

哈夫曼编码

哈夫曼编码简介#

哈夫曼编码（Huffman Coding）是一种无损数据压缩算法，由大卫·哈夫曼（David A. Huffman）在1952年提出。它通过使用变长编码来表示数据中的符号，使得出现频率较高的符号使用较短的编码，而出现频率较低的符号使用较长的编码，从而实现数据压缩。

前置知识#

编码：是指把一组对象(如字符集)中的每个对象用唯一的一个二进制位串表示。如ASCII,指令系统
解码：是编码的逆过程,把二进制位串还原成原来的对象。
前缀码：任何一个编码都不是另一个编码的前缀。
1. 前缀码保证了在解码(译码)时的唯一性。
2. 等长编码显然具有前缀性;
3. 变长编码可能不具有前缀性。比如：A->0, B->01, C->010 不是前缀码，因为A的编码0是B和C编码的前缀。
平均编码长度：设字符集中第 $j$ 个字符 $c_j$ 出现的概率为 $p_j$ ，其二进制位串长度(码长)为 $l_j$ ，则 $\sum_j l_j \cdot p_j$ 表示该组对象(字符)的平均编码长度。平均编码长度越小，表示编码的压缩效果越好。

哈夫曼树的构造#

哈夫曼树是一种带权路径长度最短的二叉树，用于生成哈夫曼编码。构造哈夫曼树的步骤如下：

初始化：为字符集中每个字符初始化一棵只有叶结点的二叉树（就一个带权节点），叶子的权值为对应字符的使用频率。
计算WPL：定义带权路径长度(WPL,Weighted Path Length)为所有叶子结点的权值与其到根结点路径长度的乘积之和。WPL越小，表示编码的压缩效果越好。 $WPL = \sum_{i=1}^{n} (权值_i \times 路径长度_i)$

核心思想：权值越大的节点越靠近根节点，WPL越小；离根节点越远的权值尽可能小

步骤：

初始化：给定 n 个字符的权值 $\{\omega_1, \omega_2, \ldots, \omega_n\}$ ，构造 n 棵只含一个节点的二叉树，每棵树根节点的权值为对应字符的权值。
重复以下步骤直到只剩下一棵树：
1. 从森林中选出两棵权值最小的二叉树 $T_1$ 和 $T_2$ 。
2. 创建一个新的二叉树 $T_{n1}$ ，其根节点的权值为 $T_1$ 和 $T_2$ 的权值之和， $T_1$ 和 $T_2$ 分别作为 $T_{n1}$ 的左子树和右子树。
3. 将新树 $T_{n1}$ 插入森林中，并移除 $T_1$ 和 $T_2$ 。

哈夫曼编码的生成#

生成哈夫曼编码的步骤如下：

从哈夫曼树的根节点开始，向左子树移动时添加0，向右子树移动时添加1（也可以约定为左1右0）；也可以理解成左分支的边标记为0、右分支的边标记为1。
当到达叶节点时，记录下该叶节点对应的字符及其路径上的编码。
重复上述过程，直到所有叶节点的编码都被记录下来。

示例代码#

用cpp实现哈夫曼编码：

1
#include <bits/stdc++.h>
2
#include <filesystem>
3
using namespace std;
4

5
// 哈夫曼树节点结构体
6
/// Node of a Huffman tree.
///
/// A leaf carries one byte value and its occurrence count; an internal node
/// carries the combined count of its children. The struct declares no
/// destructor, so destroying a node does not release its children.
struct Node {
    uint8_t symbol;      // byte value; set to 0 on internal nodes
    uint64_t freq;       // occurrence count (leaf) or sum of the children's counts (internal)
    Node *Left, *Right;  // children; both nullptr on a leaf
    bool IsLeaf;         // true when built by the (symbol, freq) constructor

    /// Builds a leaf for byte value `s` occurring `f` times.
    Node(uint8_t s, uint64_t f)
        : symbol(s), freq(f), Left(nullptr), Right(nullptr), IsLeaf(true) {}

    /// Builds an internal node over `l` and `r`.
    /// Either child may be nullptr; a missing child contributes 0 to `freq`.
    /// (A tree holding a single symbol has a root with only a left child, so
    /// the children must not be dereferenced unconditionally.)
    Node(Node* l, Node* r)
        : symbol(0),
          freq((l ? l->freq : 0) + (r ? r->freq : 0)),
          Left(l),
          Right(r),
          IsLeaf(false) {}
};
14

15
// 优先队列比较器，用于最小堆，每次push的时候都会触发判断，让小freq上浮
16
struct Cmp {
17
    bool operator()(const Node* a, const Node* b) const {
18
        if (a->freq != b->freq) return a->freq > b->freq;  // 频率小的优先
19
        return a->IsLeaf < b->IsLeaf;  // 叶子节点优先
20
    }
21
};
22

23
// 构建哈夫曼编码表
24
/// Fills `hufCodes` with the bit string ('0'/'1' characters) of every leaf
/// reachable from `Root`.
///
/// @param Root      root of the (sub)tree to walk; nullptr is a no-op.
/// @param hufCodes  table indexed by byte value; the entry of each leaf's
///                  symbol is overwritten, other entries are left untouched.
/// @param Cur       bit string accumulated on the way to `Root`.
///
/// Going left appends "0", going right appends "1". A leaf reached with an
/// empty path (the tree is a single leaf) gets the code "0" so that no symbol
/// ends up with an empty code.
void buildCodes(Node* Root, std::vector<std::string>& hufCodes, std::string Cur = "") {
    // Explicit stack of (node, path-so-far) pairs instead of recursion.
    std::vector<std::pair<Node*, std::string>> pending;
    pending.emplace_back(Root, std::move(Cur));

    while (!pending.empty()) {
        auto [node, path] = std::move(pending.back());
        pending.pop_back();

        if (node == nullptr) continue;

        if (node->IsLeaf) {
            hufCodes[node->symbol] = path.empty() ? "0" : path;
            continue;
        }

        // Push right first so the left subtree is processed first.
        pending.emplace_back(node->Right, path + "1");
        pending.emplace_back(node->Left, path + "0");
    }
}
33

34
// 构建哈夫曼树,返回根节点指针
35
/// Builds a Huffman tree from a byte-frequency table.
///
/// @param charFreqMap  byte value -> number of occurrences.
/// @return root of a tree whose nodes are allocated with `new` (release it
///         with freeTree), or nullptr when the table is empty.
///
/// The two lowest-priority trees (as ordered by Cmp) are merged repeatedly
/// until one tree remains. When the table holds exactly one symbol, the
/// returned root is an internal node whose only (left) child is that leaf.
Node* buildTree(const std::map<uint8_t, uint64_t>& charFreqMap) {
    if (charFreqMap.empty()) return nullptr;

    std::priority_queue<Node*, std::vector<Node*>, Cmp> pq;
    for (const auto& [symbol, count] : charFreqMap) {
        pq.push(new Node(symbol, count));  // heap-allocated so the tree outlives this call
    }

    if (pq.size() == 1) {
        Node* only = pq.top();
        pq.pop();
        // Do NOT build the parent with Node(only, nullptr): that constructor
        // reads the frequency of both children and would dereference the
        // missing right child. Start from a plain node and wire it up by hand.
        Node* parent = new Node(static_cast<uint8_t>(0), only->freq);
        parent->IsLeaf = false;
        parent->Left = only;
        return parent;
    }

    while (pq.size() > 1) {
        Node* A = pq.top();
        pq.pop();
        Node* B = pq.top();
        pq.pop();
        pq.push(new Node(A, B));  // first-popped tree becomes the left child
    }
    return pq.top();
}
56

57
// 释放哈夫曼树内存
58
/// Deletes every node reachable from `Root`. Passing nullptr is a no-op.
void freeTree(Node* Root) {
    std::vector<Node*> pending;
    if (Root) pending.push_back(Root);

    while (!pending.empty()) {
        Node* current = pending.back();
        pending.pop_back();

        // Queue the children before the node itself is destroyed.
        if (current->Left) pending.push_back(current->Left);
        if (current->Right) pending.push_back(current->Right);
        delete current;
    }
}
64

65
// AI写的：打印哈夫曼树结构（树形显示）
66
/// Prints the tree rooted at `node` to standard output, one node per line.
///
/// @param node    subtree to print; nullptr prints nothing.
/// @param prefix  text printed before this node; when it is non-empty a
///                branch connector is printed after it.
/// @param isLast  whether this node is the last child of its parent; selects
///                the connector and the indentation handed to the children.
///
/// Leaves show their symbol (quoted when printable, otherwise as a hex
/// number) and frequency; internal nodes show their combined frequency.
void printTree(Node* node, std::string prefix = "", bool isLast = false) {
    if (!node) return;

    std::cout << prefix;
    if (!prefix.empty()) std::cout << (isLast ? "└── " : "├── ");

    if (!node->IsLeaf) {
        std::cout << "分支 (" << node->freq << ")\n";
    } else if (std::isprint(node->symbol)) {
        std::cout << "'" << char(node->symbol) << "' (" << node->freq << ")\n";
    } else {
        // uint8_t is a character type: streaming it directly would emit the
        // raw byte. Widen to int so the value is printed as a hex number.
        std::cout << "0x" << std::hex << static_cast<int>(node->symbol)
                  << " (" << std::dec << node->freq << ")\n";
    }

    const std::string newPrefix = prefix + (isLast ? "    " : "│   ");
    if (node->Left) printTree(node->Left, newPrefix, node->Right == nullptr);
    if (node->Right) printTree(node->Right, newPrefix, true);
}
83

84
// 对文件进行哈夫曼编码
85
bool encodeFile(const string& inPath, const string& outPath) {
86
    ifstream in(inPath, ios::binary);//用二进制模式打开输入文件
87
    if (!in) return false;
88
    //用istreambuf构建一个字节数据向量，char表示每个元素是一个字母对应的ASCII码
89
    //data存储转成哈夫曼编码之后的字节流
90
    vector<uint8_t> Data((istreambuf_iterator<char>(in)), istreambuf_iterator<char>());
91
    in.close();
92

93
    map<uint8_t, uint64_t> charFreqMap;//计算字符频率,键是字节值（uint8_t），值是频率(uint64_t)
94
    //这里data里面存的是ASCII码对应的字节值，B就是ASCII码
95
    for (uint8_t asciiChar : Data) charFreqMap[asciiChar]++;
96

97
    Node* Root = buildTree(charFreqMap);//根据频率表构建哈夫曼树
98
    vector<string> hufCodes(256);//256个可能的字节值对应的ascii编码字符串0～255
99
    if (Root) buildCodes(Root, hufCodes);//构建哈夫曼编码表
100

101
    // 打印哈夫曼树结构
102
    cout << "哈夫曼树结构:" << endl;
103
    printTree(Root);//(AI写的)
104
    cout << endl;
105

106
    // 将数据编码为比特流（位操作）
107
    vector<uint8_t> outBytes;
108
    uint8_t currentByte = 0;// 当前正在构建的字节，默认值为0
109
    int bitpos = 0;  // 当前字节的位位置（0-7，从高位开始）
110
    string BitStreamStr;  // 01序列字符串
111
    for (uint8_t asciiChar : Data) {
112
        const string& code = hufCodes[asciiChar];//用ascii码访问编码表
113
        for (char c : code) {
114
            BitStreamStr += c;  // 累积01序列
115
            // '1' 设置为 1，'0' 设置为 0
116
            if (c == '1') {
117
                currentByte |= (1 << (7 - bitpos));
118
            }
119
            bitpos++;
120
            if (bitpos == 8) {//每积累满8位，形成一个完整字节
121
                outBytes.push_back(currentByte);
122
                currentByte = 0;
123
                bitpos = 0;
124
            }
125
        }
126
    }
127
    // 处理填充：添加 '0' 直到字节边界
128
    int padding = (8 - bitpos) % 8;
129
    for (int i = 0; i < padding; ++i) {
130
        BitStreamStr += '0';  // 填充 '0'
131
        bitpos++;
132
        if (bitpos == 8) {
133
            outBytes.push_back(currentByte);
134
            currentByte = 0;
135
            bitpos = 0;
136
        }
137
    }
138
    if (bitpos > 0) {
139
        outBytes.push_back(currentByte);  // 最后不完整的字节
140
    }
141

142
    // 输出编码表和01序列到文本文件
143
    ofstream codes_out("codes.txt");
144
    for (int i = 0; i < 256; ++i) {
145
        if (!hufCodes[i].empty()) {
146
            if (isprint(i)) {
147
                codes_out << "'" << char(i) << "' (" << i << "): " << hufCodes[i] << "\n";
148
            } else {
149
                codes_out << "0x" << hex << i << ": " << hufCodes[i] << "\n";
150
            }
151
        }
152
    }
153
    codes_out.close();
154

155
    ofstream bitstream_out("bitstream.txt");
156
    bitstream_out << BitStreamStr << "\n";
157
    bitstream_out.close();
158

159
    // 写入文件头：魔数 + 符号数量 + (符号, 编码长度, 编码位串) + 填充位 + 数据
160
    ofstream out(outPath, ios::binary);
161
    if (!out) { freeTree(Root); return false; }
162
    out.write("HUF2", 4);  // 更新魔数为 HUF2，表示新格式
163
    uint16_t Cnt = 0;// 有效符号数量
164
    for (auto& code : hufCodes) if (!code.empty()) Cnt++;
165
    out.write(reinterpret_cast<const char*>(&Cnt), sizeof(Cnt));
166
    for (int i = 0; i < 256; ++i) {
167
        if (!hufCodes[i].empty()) {
168
            uint8_t S = (uint8_t)i;// 符号对应的ascii码
169
            uint8_t Len = (uint8_t)hufCodes[i].size();// 编码长度
170
            out.write(reinterpret_cast<const char*>(&S), sizeof(S));
171
            out.write(reinterpret_cast<const char*>(&Len), sizeof(Len));
172
            // 将编码字符串转换为字节写入
173
            uint8_t codeByte = 0;
174
            int bitPos = 7;
175
            for (char c : hufCodes[i]) {
176
                if (c == '1') codeByte |= (1 << bitPos);
177
                bitPos--;
178
                if (bitPos < 0) {
179
                    out.write(reinterpret_cast<const char*>(&codeByte), sizeof(codeByte));
180
                    codeByte = 0;
181
                    bitPos = 7;
182
                }
183
            }
184
            if (bitPos < 7) out.write(reinterpret_cast<const char*>(&codeByte), sizeof(codeByte));
185
        }
186
    }
187
    uint8_t Pad8 = (uint8_t)padding;
188
    out.write(reinterpret_cast<const char*>(&Pad8), sizeof(Pad8));
189
    if (!outBytes.empty()) out.write(reinterpret_cast<const char*>(outBytes.data()), outBytes.size());
190
    out.close();
191

192
    size_t Orig = filesystem::file_size(inPath);
193
    size_t Comp = filesystem::file_size(outPath);
194
    cout << "原始大小: " << Orig << " 字节\n";
195
    cout << "压缩后大小: " << Comp << " 字节\n";
196
    double Rate = Orig ? (double(Comp) / double(Orig) * 100.0) : 0.0;
197
    cout << fixed << setprecision(2) << "压缩率: " << Rate << "%\n";
198

199
    freeTree(Root);
200
    return true;
201
}
202

203
// 对文件进行哈夫曼译码
204
/// Decodes a "HUF2" file and writes the recovered bytes to `OutPath`.
///
/// Expected input layout, in order: 4-byte magic "HUF2"; a uint16 symbol
/// count (raw in-memory bytes of the integer); per symbol one byte symbol, one
/// byte code length in bits and the code bits packed MSB-first; one byte
/// giving the number of padding bits at the end of the payload; the payload.
///
/// @return true when the file was decoded and written completely. Returns
///         false when the input cannot be opened, the magic does not match,
///         the header is truncated, the padding count is not in 0..7, the
///         payload ends in the middle of a code, or the output cannot be
///         opened or written.
bool decodeFile(const std::string& InPath, const std::string& OutPath) {
    std::ifstream In(InPath, std::ios::binary);
    if (!In) return false;

    char Magic[4];
    In.read(Magic, 4);
    if (In.gcount() != 4 || std::strncmp(Magic, "HUF2", 4) != 0) {
        std::cerr << "不是有效的 HUF 文件\n";
        return false;
    }

    // Every header read is checked: a failed read leaves its destination
    // untouched, so continuing would act on garbage.
    uint16_t Cnt = 0;
    if (!In.read(reinterpret_cast<char*>(&Cnt), sizeof(Cnt))) return false;

    std::map<std::string, uint8_t> codeToSymbol;  // bit string -> byte value
    for (int i = 0; i < Cnt; ++i) {
        uint8_t S = 0;    // byte value
        uint8_t Len = 0;  // code length in bits
        if (!In.read(reinterpret_cast<char*>(&S), sizeof(S))) return false;
        if (!In.read(reinterpret_cast<char*>(&Len), sizeof(Len))) return false;

        // Unpack Len bits, MSB-first, from ceil(Len / 8) bytes.
        std::string code;
        code.reserve(Len);
        while (code.size() < Len) {
            uint8_t packed = 0;
            if (!In.read(reinterpret_cast<char*>(&packed), sizeof(packed))) return false;
            for (int b = 7; b >= 0 && code.size() < Len; --b) {
                code += ((packed >> b) & 1) ? '1' : '0';
            }
        }
        codeToSymbol[code] = S;
    }

    uint8_t padding8 = 0;
    if (!In.read(reinterpret_cast<char*>(&padding8), sizeof(padding8))) return false;
    if (padding8 > 7) return false;  // padding only ever completes one byte

    // Everything that remains is the packed payload.
    std::vector<uint8_t> Data((std::istreambuf_iterator<char>(In)),
                              std::istreambuf_iterator<char>());
    In.close();

    // Number of payload bits that carry data (the trailing padding is skipped).
    const std::size_t total_bits = Data.empty() ? 0 : Data.size() * 8 - padding8;

    std::vector<uint8_t> OutBytes;
    std::string currentCode;  // bits read since the last emitted symbol
    for (std::size_t pos = 0; pos < total_bits; ++pos) {
        const uint8_t packed = Data[pos / 8];
        const bool bit = (packed >> (7 - pos % 8)) & 1;
        currentCode += bit ? '1' : '0';

        // The codes are prefix-free, so the first match is the symbol.
        const auto hit = codeToSymbol.find(currentCode);
        if (hit != codeToSymbol.end()) {
            OutBytes.push_back(hit->second);
            currentCode.clear();
        }
    }
    // Leftover bits mean the payload stopped in the middle of a code.
    if (!currentCode.empty()) return false;

    std::ofstream Out(OutPath, std::ios::binary);
    if (!Out) return false;
    if (!OutBytes.empty()) {
        Out.write(reinterpret_cast<const char*>(OutBytes.data()),
                  static_cast<std::streamsize>(OutBytes.size()));
    }
    Out.close();
    return !Out.fail();
}
277

278
/// Prints the command-line help text to standard output.
void usage() {
    static const char* const kHelp =
        "用法:\n"
        "  huffman encode <输入文件> <输出文件>\n"
        "  huffman decode <输入文件.huf> <输出文件>\n";
    std::cout << kHelp;
}
283

284
int main(int argc, char** argv) {
285
    if (argc != 4) { usage(); return 1; }
286
    string cmd = argv[1];
287
    string inp = argv[2];
288
    string outp = argv[3];
289
    if (cmd == "encode") {
290
        if (!encodeFile(inp, outp)) { cerr << "编码失败\n"; return 2; }
291
    } else if (cmd == "decode") {
292
        if (!decodeFile(inp, outp)) { cerr << "译码失败\n"; return 3; }
293
    } else {
294
        usage(); return 1;
295
    }
296
    return 0;
297
}