zlib: optimize inffast when copying direct from output
[safe/jmp/linux-2.6] / lib / zlib_inflate / inffast.c
1 /* inffast.c -- fast decoding
2  * Copyright (C) 1995-2004 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  */
5
6 #include <linux/zutil.h>
7 #include <asm/unaligned.h>
8 #include <asm/byteorder.h>
9 #include "inftrees.h"
10 #include "inflate.h"
11 #include "inffast.h"
12
13 #ifndef ASMINF
14
15 /* Allow machine dependent optimization for post-increment or pre-increment.
16    Based on testing to date,
17    Pre-increment preferred for:
18    - PowerPC G3 (Adler)
19    - MIPS R5000 (Randers-Pehrson)
20    Post-increment preferred for:
21    - none
22    No measurable difference:
23    - Pentium III (Anderson)
24    - M68060 (Nikl)
25  */
26 #ifdef POSTINC
27 #  define OFF 0
28 #  define PUP(a) *(a)++
29 #  define UP_UNALIGNED(a) get_unaligned((a)++)
30 #else
31 #  define OFF 1
32 #  define PUP(a) *++(a)
33 #  define UP_UNALIGNED(a) get_unaligned(++(a))
34 #endif
35
36 /*
37    Decode literal, length, and distance codes and write out the resulting
38    literal and match bytes until either not enough input or output is
39    available, an end-of-block is encountered, or a data error is encountered.
40    When large enough input and output buffers are supplied to inflate(), for
41    example, a 16K input buffer and a 64K output buffer, more than 95% of the
42    inflate execution time is spent in this routine.
43
44    Entry assumptions:
45
46         state->mode == LEN
47         strm->avail_in >= 6
48         strm->avail_out >= 258
49         start >= strm->avail_out
50         state->bits < 8
51
52    On return, state->mode is one of:
53
54         LEN -- ran out of enough output space or enough available input
55         TYPE -- reached end of block code, inflate() to interpret next block
56         BAD -- error in block data
57
58    Notes:
59
60     - The maximum input bits used by a length/distance pair is 15 bits for the
61       length code, 5 bits for the length extra, 15 bits for the distance code,
62       and 13 bits for the distance extra.  This totals 48 bits, or six bytes.
63       Therefore if strm->avail_in >= 6, then there is enough input to avoid
64       checking for available input while decoding.
65
66     - The maximum bytes that a single length/distance pair can output is 258
67       bytes, which is the maximum length that can be coded.  inflate_fast()
68       requires strm->avail_out >= 258 for each loop to avoid checking for
69       output space.
70
71     - @start:   inflate()'s starting value for strm->avail_out
72  */
73 void inflate_fast(z_streamp strm, unsigned start)
74 {
75     struct inflate_state *state;
76     const unsigned char *in;    /* local strm->next_in */
77     const unsigned char *last;  /* while in < last, enough input available */
78     unsigned char *out;         /* local strm->next_out */
79     unsigned char *beg;         /* inflate()'s initial strm->next_out */
80     unsigned char *end;         /* while out < end, enough space available */
81 #ifdef INFLATE_STRICT
82     unsigned dmax;              /* maximum distance from zlib header */
83 #endif
84     unsigned wsize;             /* window size or zero if not using window */
85     unsigned whave;             /* valid bytes in the window */
86     unsigned write;             /* window write index */
87     unsigned char *window;      /* allocated sliding window, if wsize != 0 */
88     unsigned long hold;         /* local strm->hold */
89     unsigned bits;              /* local strm->bits */
90     code const *lcode;          /* local strm->lencode */
91     code const *dcode;          /* local strm->distcode */
92     unsigned lmask;             /* mask for first level of length codes */
93     unsigned dmask;             /* mask for first level of distance codes */
94     code this;                  /* retrieved table entry */
95     unsigned op;                /* code bits, operation, extra bits, or */
96                                 /*  window position, window bytes to copy */
97     unsigned len;               /* match length, unused bytes */
98     unsigned dist;              /* match distance */
99     unsigned char *from;        /* where to copy match from */
100
101     /* copy state to local variables */
102     state = (struct inflate_state *)strm->state;
103     in = strm->next_in - OFF;
104     last = in + (strm->avail_in - 5);
105     out = strm->next_out - OFF;
106     beg = out - (start - strm->avail_out);
107     end = out + (strm->avail_out - 257);
108 #ifdef INFLATE_STRICT
109     dmax = state->dmax;
110 #endif
111     wsize = state->wsize;
112     whave = state->whave;
113     write = state->write;
114     window = state->window;
115     hold = state->hold;
116     bits = state->bits;
117     lcode = state->lencode;
118     dcode = state->distcode;
119     lmask = (1U << state->lenbits) - 1;
120     dmask = (1U << state->distbits) - 1;
121
122     /* decode literals and length/distances until end-of-block or not enough
123        input data or output space */
124     do {
125         if (bits < 15) {
126             hold += (unsigned long)(PUP(in)) << bits;
127             bits += 8;
128             hold += (unsigned long)(PUP(in)) << bits;
129             bits += 8;
130         }
131         this = lcode[hold & lmask];
132       dolen:
133         op = (unsigned)(this.bits);
134         hold >>= op;
135         bits -= op;
136         op = (unsigned)(this.op);
137         if (op == 0) {                          /* literal */
138             PUP(out) = (unsigned char)(this.val);
139         }
140         else if (op & 16) {                     /* length base */
141             len = (unsigned)(this.val);
142             op &= 15;                           /* number of extra bits */
143             if (op) {
144                 if (bits < op) {
145                     hold += (unsigned long)(PUP(in)) << bits;
146                     bits += 8;
147                 }
148                 len += (unsigned)hold & ((1U << op) - 1);
149                 hold >>= op;
150                 bits -= op;
151             }
152             if (bits < 15) {
153                 hold += (unsigned long)(PUP(in)) << bits;
154                 bits += 8;
155                 hold += (unsigned long)(PUP(in)) << bits;
156                 bits += 8;
157             }
158             this = dcode[hold & dmask];
159           dodist:
160             op = (unsigned)(this.bits);
161             hold >>= op;
162             bits -= op;
163             op = (unsigned)(this.op);
164             if (op & 16) {                      /* distance base */
165                 dist = (unsigned)(this.val);
166                 op &= 15;                       /* number of extra bits */
167                 if (bits < op) {
168                     hold += (unsigned long)(PUP(in)) << bits;
169                     bits += 8;
170                     if (bits < op) {
171                         hold += (unsigned long)(PUP(in)) << bits;
172                         bits += 8;
173                     }
174                 }
175                 dist += (unsigned)hold & ((1U << op) - 1);
176 #ifdef INFLATE_STRICT
177                 if (dist > dmax) {
178                     strm->msg = (char *)"invalid distance too far back";
179                     state->mode = BAD;
180                     break;
181                 }
182 #endif
183                 hold >>= op;
184                 bits -= op;
185                 op = (unsigned)(out - beg);     /* max distance in output */
186                 if (dist > op) {                /* see if copy from window */
187                     op = dist - op;             /* distance back in window */
188                     if (op > whave) {
189                         strm->msg = (char *)"invalid distance too far back";
190                         state->mode = BAD;
191                         break;
192                     }
193                     from = window - OFF;
194                     if (write == 0) {           /* very common case */
195                         from += wsize - op;
196                         if (op < len) {         /* some from window */
197                             len -= op;
198                             do {
199                                 PUP(out) = PUP(from);
200                             } while (--op);
201                             from = out - dist;  /* rest from output */
202                         }
203                     }
204                     else if (write < op) {      /* wrap around window */
205                         from += wsize + write - op;
206                         op -= write;
207                         if (op < len) {         /* some from end of window */
208                             len -= op;
209                             do {
210                                 PUP(out) = PUP(from);
211                             } while (--op);
212                             from = window - OFF;
213                             if (write < len) {  /* some from start of window */
214                                 op = write;
215                                 len -= op;
216                                 do {
217                                     PUP(out) = PUP(from);
218                                 } while (--op);
219                                 from = out - dist;      /* rest from output */
220                             }
221                         }
222                     }
223                     else {                      /* contiguous in window */
224                         from += write - op;
225                         if (op < len) {         /* some from window */
226                             len -= op;
227                             do {
228                                 PUP(out) = PUP(from);
229                             } while (--op);
230                             from = out - dist;  /* rest from output */
231                         }
232                     }
233                     while (len > 2) {
234                         PUP(out) = PUP(from);
235                         PUP(out) = PUP(from);
236                         PUP(out) = PUP(from);
237                         len -= 3;
238                     }
239                     if (len) {
240                         PUP(out) = PUP(from);
241                         if (len > 1)
242                             PUP(out) = PUP(from);
243                     }
244                 }
245                 else {
246                     unsigned short *sout;
247                     unsigned long loops;
248
249                     from = out - dist;          /* copy direct from output */
250                     /* minimum length is three */
251                     /* Align out addr */
252                     if (!((long)(out - 1 + OFF) & 1)) {
253                         PUP(out) = PUP(from);
254                         len--;
255                     }
256                     sout = (unsigned short *)(out - OFF);
257                     if (dist > 2) {
258                         unsigned short *sfrom;
259
260                         sfrom = (unsigned short *)(from - OFF);
261                         loops = len >> 1;
262                         do
263                             PUP(sout) = UP_UNALIGNED(sfrom);
264                         while (--loops);
265                         out = (unsigned char *)sout + OFF;
266                         from = (unsigned char *)sfrom + OFF;
267                     } else { /* dist == 1 or dist == 2 */
268                         unsigned short pat16;
269
270                         pat16 = *(sout-2+2*OFF);
271                         if (dist == 1)
272 #if defined(__BIG_ENDIAN)
273                             pat16 = (pat16 & 0xff) | ((pat16 & 0xff) << 8);
274 #elif defined(__LITTLE_ENDIAN)
275                             pat16 = (pat16 & 0xff00) | ((pat16 & 0xff00) >> 8);
276 #else
277 #error __BIG_ENDIAN nor __LITTLE_ENDIAN is defined
278 #endif
279                         loops = len >> 1;
280                         do
281                             PUP(sout) = pat16;
282                         while (--loops);
283                         out = (unsigned char *)sout + OFF;
284                     }
285                     if (len & 1)
286                         PUP(out) = PUP(from);
287                 }
288             }
289             else if ((op & 64) == 0) {          /* 2nd level distance code */
290                 this = dcode[this.val + (hold & ((1U << op) - 1))];
291                 goto dodist;
292             }
293             else {
294                 strm->msg = (char *)"invalid distance code";
295                 state->mode = BAD;
296                 break;
297             }
298         }
299         else if ((op & 64) == 0) {              /* 2nd level length code */
300             this = lcode[this.val + (hold & ((1U << op) - 1))];
301             goto dolen;
302         }
303         else if (op & 32) {                     /* end-of-block */
304             state->mode = TYPE;
305             break;
306         }
307         else {
308             strm->msg = (char *)"invalid literal/length code";
309             state->mode = BAD;
310             break;
311         }
312     } while (in < last && out < end);
313
314     /* return unused bytes (on entry, bits < 8, so in won't go too far back) */
315     len = bits >> 3;
316     in -= len;
317     bits -= len << 3;
318     hold &= (1U << bits) - 1;
319
320     /* update state and return */
321     strm->next_in = in + OFF;
322     strm->next_out = out + OFF;
323     strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last));
324     strm->avail_out = (unsigned)(out < end ?
325                                  257 + (end - out) : 257 - (out - end));
326     state->hold = hold;
327     state->bits = bits;
328     return;
329 }
330
331 /*
332    inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
333    - Using bit fields for code structure
334    - Different op definition to avoid & for extra bits (do & for table bits)
335    - Three separate decoding do-loops for direct, window, and write == 0
336    - Special case for distance > 1 copies to do overlapped load and store copy
337    - Explicit branch predictions (based on measured branch probabilities)
338    - Deferring match copy and interspersed it with decoding subsequent codes
339    - Swapping literal/length else
340    - Swapping window/direct else
341    - Larger unrolled copy loops (three is about right)
342    - Moving len -= 3 statement into middle of loop
343  */
344
345 #endif /* !ASMINF */