[GitHub/exynos8895/android_kernel_samsung_universal8895.git] / crypto / gf128mul.c

/* gf128mul.c - GF(2^128) multiplication functions
 *
 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
 * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on Dr Brian Gladman's (GPL'd) work published at
 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
 * See the original copyright notice below.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 This file provides fast multiplication in GF(2^128) as required by several
 cryptographic authentication modes
*/

#include <crypto/gf128mul.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/*
 * gf128mul_dat(q) expands to a brace-enclosed initializer list with 256
 * entries: q(0x00), q(0x01), ..., q(0xff), in ascending order.  q must be
 * a function-like macro (or function usable in a constant initializer)
 * taking one byte value; element n of the resulting array is q(n).
 */
#define gf128mul_dat(q) { \
	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
}

/*
 * Given a value i in 0..255 as the byte overflow when a field element
 * in GF(2^128) is multiplied by x^8, the following macro returns the
 * 16-bit value that must be XOR-ed into the low-degree end of the
 * product to reduce it modulo the irreducible polynomial x^128 + x^7 +
 * x^2 + x + 1.
 *
 * There are two versions of the macro, and hence two tables: one for
 * the "be" convention where the highest-order bit is the coefficient of
 * the highest-degree polynomial term, and one for the "le" convention
 * where the highest-order bit is the coefficient of the lowest-degree
 * polynomial term.  In both cases the values are stored in CPU byte
 * endianness such that the coefficients are ordered consistently across
 * bytes, i.e. in the "be" table bits 15..0 of the stored value
 * correspond to the coefficients of x^15..x^0, and in the "le" table
 * bits 15..0 correspond to the coefficients of x^0..x^15.
 *
 * Therefore, provided that the appropriate byte endianness conversions
 * are done by the multiplication functions (and these must be in place
 * anyway to support both little endian and big endian CPUs), the "be"
 * table can be used for multiplications of both "bbe" and "ble"
 * elements, and the "le" table can be used for multiplications of both
 * "lle" and "lbe" elements.
 */

/*
 * xda_be(i) - "be" convention reduction value for overflow byte i (0..255).
 *
 * Every set bit k of i contributes the constant 0x0087 << k (0x87 being
 * the x^7 + x^2 + x + 1 tail of the field polynomial); the contributions
 * are XORed together.
 *
 * The argument is parenthesized so that an expression such as
 * xda_be(a | b) is masked as a whole.  It is still evaluated eight
 * times, so it must be free of side effects.
 */
#define xda_be(i) ( \
	((i) & 0x80 ? 0x4380 : 0) ^ ((i) & 0x40 ? 0x21c0 : 0) ^ \
	((i) & 0x20 ? 0x10e0 : 0) ^ ((i) & 0x10 ? 0x0870 : 0) ^ \
	((i) & 0x08 ? 0x0438 : 0) ^ ((i) & 0x04 ? 0x021c : 0) ^ \
	((i) & 0x02 ? 0x010e : 0) ^ ((i) & 0x01 ? 0x0087 : 0) \
)

/*
 * xda_le(i) - "le" convention reduction value for overflow byte i (0..255).
 *
 * Bit 0x80 of i contributes 0xe100 and each lower bit contributes that
 * constant shifted right by one more position (down to 0x01c2 for bit
 * 0x01); the contributions are XORed together.
 *
 * The argument is parenthesized so that an expression such as
 * xda_le(a | b) is masked as a whole.  It is still evaluated eight
 * times, so it must be free of side effects.
 */
#define xda_le(i) ( \
	((i) & 0x80 ? 0xe100 : 0) ^ ((i) & 0x40 ? 0x7080 : 0) ^ \
	((i) & 0x20 ? 0x3840 : 0) ^ ((i) & 0x10 ? 0x1c20 : 0) ^ \
	((i) & 0x08 ? 0x0e10 : 0) ^ ((i) & 0x04 ? 0x0708 : 0) ^ \
	((i) & 0x02 ? 0x0384 : 0) ^ ((i) & 0x01 ? 0x01c2 : 0) \
)

/*
 * Compile-time reduction lookup tables: entry n holds xda_le(n)
 * (respectively xda_be(n)) for every byte value n in 0..255.  They are
 * indexed by the bits shifted out of a field element by the
 * multiply-by-x and multiply-by-x^8 helpers in this file.
 */
static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);

/*
 * The following functions multiply a field element by x or by x^8 in
 * the polynomial field representation.  They use 64-bit word operations
 * to gain speed but compensate for machine endianness and hence work
 * correctly on both styles of machine.
 */

/*
 * gf128mul_x_lle - multiply a field element by x ("lle" layout)
 * @r: where the product is stored
 * @x: element to multiply
 *
 * Both 64-bit halves are converted from big endian and the 128-bit value
 * is shifted right by one bit.  The bit that drops off the bottom of the
 * second half selects entry 0x00 or 0x80 of gf128mul_table_le, which is
 * folded into the top 16 bits of the first half.  All input is loaded
 * before anything is stored, so @r and @x may be the same element.
 */
static void gf128mul_x_lle(be128 *r, const be128 *x)
{
	const u64 w0 = be64_to_cpu(x->a);
	const u64 w1 = be64_to_cpu(x->b);
	u64 reduce = gf128mul_table_le[(w1 << 7) & 0xff];

	reduce <<= 48;
	r->a = cpu_to_be64(reduce ^ (w0 >> 1));
	r->b = cpu_to_be64((w0 << 63) | (w1 >> 1));
}

/*
 * gf128mul_x_bbe - multiply a field element by x ("bbe" layout)
 * @r: where the product is stored
 * @x: element to multiply
 *
 * Both 64-bit halves are converted from big endian and the 128-bit value
 * is shifted left by one bit.  The bit that drops off the top of the
 * first half selects entry 0 or 1 of gf128mul_table_be, which is XORed
 * into the second half.  All input is loaded before anything is stored,
 * so @r and @x may be the same element.
 */
static void gf128mul_x_bbe(be128 *r, const be128 *x)
{
	const u64 w0 = be64_to_cpu(x->a);
	const u64 w1 = be64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_be[w0 >> 63];

	r->b = cpu_to_be64(reduce ^ (w1 << 1));
	r->a = cpu_to_be64((w1 >> 63) | (w0 << 1));
}

/*
 * gf128mul_x_ble - multiply a field element by x ("ble" layout)
 * @r: where the product is stored
 * @x: element to multiply
 *
 * Both 64-bit halves are converted from little endian; the second half
 * is the more significant one here.  The 128-bit value is shifted left by
 * one bit and the bit that drops off the top of the second half selects
 * entry 0 or 1 of gf128mul_table_be, which is XORed into the first half.
 * All input is loaded before anything is stored, so @r and @x may be the
 * same element.
 */
void gf128mul_x_ble(be128 *r, const be128 *x)
{
	const u64 w0 = le64_to_cpu(x->a);
	const u64 w1 = le64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_be[w1 >> 63];

	r->b = cpu_to_le64((w0 >> 63) | (w1 << 1));
	r->a = cpu_to_le64(reduce ^ (w0 << 1));
}
EXPORT_SYMBOL(gf128mul_x_ble);

/*
 * gf128mul_x8_lle - multiply a field element by x^8 in place ("lle" layout)
 * @x: element to update
 *
 * The 128-bit value (two big-endian 64-bit halves) is shifted right by
 * eight bits.  The byte that drops off the bottom of the second half
 * indexes gf128mul_table_le, and the 16-bit result is folded into the top
 * 16 bits of the first half.
 */
static void gf128mul_x8_lle(be128 *x)
{
	const u64 w0 = be64_to_cpu(x->a);
	const u64 w1 = be64_to_cpu(x->b);
	u64 reduce = gf128mul_table_le[w1 & 0xff];

	reduce <<= 48;
	x->a = cpu_to_be64(reduce ^ (w0 >> 8));
	x->b = cpu_to_be64((w0 << 56) | (w1 >> 8));
}

/*
 * gf128mul_x8_bbe - multiply a field element by x^8 in place ("bbe" layout)
 * @x: element to update
 *
 * The 128-bit value (two big-endian 64-bit halves) is shifted left by
 * eight bits.  The byte that drops off the top of the first half indexes
 * gf128mul_table_be, and the result is XORed into the second half.
 */
static void gf128mul_x8_bbe(be128 *x)
{
	const u64 w0 = be64_to_cpu(x->a);
	const u64 w1 = be64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_be[w0 >> 56];

	x->b = cpu_to_be64(reduce ^ (w1 << 8));
	x->a = cpu_to_be64((w1 >> 56) | (w0 << 8));
}

/*
 * gf128mul_x8_ble - multiply a field element by x^8 in place ("ble" layout)
 * @x: element to update
 *
 * Both 64-bit halves are converted from little endian; the second half
 * is the more significant one here.  The 128-bit value is shifted left by
 * eight bits.  The byte that drops off the top of the second half indexes
 * gf128mul_table_be, and the result is XORed into the first half.
 */
static void gf128mul_x8_ble(be128 *x)
{
	const u64 first = le64_to_cpu(x->a);
	const u64 second = le64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_be[second >> 56];

	x->a = cpu_to_le64(reduce ^ (first << 8));
	x->b = cpu_to_le64((first >> 56) | (second << 8));
}

/*
 * gf128mul_lle - table-free multiplication of two "lle" field elements
 * @r: first operand on entry, product on return
 * @b: second operand
 *
 * The multiples (*r) * x^k for k = 0..7 are computed once.  The bytes of
 * @b are then consumed from offset 15 down to offset 0: for each byte the
 * multiples selected by its set bits are XORed into the accumulator (bit
 * 0x80 selects k = 0, bit 0x01 selects k = 7), and between bytes the
 * accumulator is multiplied by x^8.
 *
 * @b is read only after *r has been cleared, so @r and @b must be
 * distinct objects.
 */
void gf128mul_lle(be128 *r, const be128 *b)
{
	const u8 *bytes = (const u8 *)b;
	be128 shifted[8];
	int idx, bit;

	shifted[0] = *r;
	for (idx = 1; idx < 8; idx++)
		gf128mul_x_lle(&shifted[idx], &shifted[idx - 1]);

	memset(r, 0, sizeof(*r));
	for (idx = 15; idx >= 0; idx--) {
		const u8 ch = bytes[idx];

		for (bit = 0; bit < 8; bit++) {
			if (ch & (0x80 >> bit))
				be128_xor(r, r, &shifted[bit]);
		}

		/* no shift after the final byte */
		if (idx > 0)
			gf128mul_x8_lle(r);
	}
}
EXPORT_SYMBOL(gf128mul_lle);

/*
 * gf128mul_bbe - table-free multiplication of two "bbe" field elements
 * @r: first operand on entry, product on return
 * @b: second operand
 *
 * The multiples (*r) * x^k for k = 0..7 are computed once.  The bytes of
 * @b are then consumed from offset 0 up to offset 15: for each byte the
 * multiples selected by its set bits are XORed into the accumulator (bit
 * value 1 << k selects multiple k), and between bytes the accumulator is
 * multiplied by x^8.
 *
 * @b is read only after *r has been cleared, so @r and @b must be
 * distinct objects.
 */
void gf128mul_bbe(be128 *r, const be128 *b)
{
	const u8 *bytes = (const u8 *)b;
	be128 shifted[8];
	int idx, bit;

	shifted[0] = *r;
	for (idx = 1; idx < 8; idx++)
		gf128mul_x_bbe(&shifted[idx], &shifted[idx - 1]);

	memset(r, 0, sizeof(*r));
	for (idx = 0; idx < 16; idx++) {
		const u8 ch = bytes[idx];

		for (bit = 7; bit >= 0; bit--) {
			if (ch & (1 << bit))
				be128_xor(r, r, &shifted[bit]);
		}

		/* no shift after the final byte */
		if (idx < 15)
			gf128mul_x8_bbe(r);
	}
}
EXPORT_SYMBOL(gf128mul_bbe);

/*
 * gf128mul_ble - table-free multiplication of two "ble" field elements
 * @r: first operand on entry, product on return
 * @b: second operand
 *
 * The multiples (*r) * x^k for k = 0..7 are computed once.  The bytes of
 * @b are then consumed from offset 15 down to offset 0: for each byte the
 * multiples selected by its set bits are XORed into the accumulator (bit
 * value 1 << k selects multiple k), and between bytes the accumulator is
 * multiplied by x^8.
 *
 * @b is read only after *r has been cleared, so @r and @b must be
 * distinct objects.
 */
void gf128mul_ble(be128 *r, const be128 *b)
{
	const u8 *bytes = (const u8 *)b;
	be128 shifted[8];
	int idx, bit;

	shifted[0] = *r;
	for (idx = 1; idx < 8; idx++)
		gf128mul_x_ble(&shifted[idx], &shifted[idx - 1]);

	memset(r, 0, sizeof(*r));
	for (idx = 15; idx >= 0; idx--) {
		const u8 ch = bytes[idx];

		for (bit = 7; bit >= 0; bit--) {
			if (ch & (1 << bit))
				be128_xor(r, r, &shifted[bit]);
		}

		/* no shift after the final byte */
		if (idx > 0)
			gf128mul_x8_ble(r);
	}
}
EXPORT_SYMBOL(gf128mul_ble);


/*      This version uses 64k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in
    the buffer's lowest byte, we can construct a table of
    the 256 16 byte values that result from the 256 values
    of this byte.  This requires 4096 bytes. But we also
    need tables for each of the 15 higher bytes in the
    buffer as well, which makes 64 kbytes in total.
*/
/*
 * gf128mul_init_64k_lle - allocate and fill the 16 x 256 entry "lle" tables
 * @g: field element the tables are built for
 *
 * Table layout:
 *   t[0][BYTE]  contains g*BYTE
 *   t[1][BYTE]  contains g*x^8*BYTE
 *    ..
 *   t[15][BYTE] contains g*x^120*BYTE
 *
 * Within each table the entries at power-of-two indices are produced
 * first; every other entry is the XOR of the entry for its highest set
 * bit and the entry for the remaining bits.
 *
 * Returns the new table set, or NULL if an allocation fails (whatever had
 * been allocated up to that point is released first).
 */
struct gf128mul_64k *gf128mul_init_64k_lle(const be128 *g)
{
	struct gf128mul_64k *tbl;
	int n, bit, lo;

	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return NULL;

	for (n = 0; n < 16; n++) {
		tbl->t[n] = kzalloc(sizeof(*tbl->t[n]), GFP_KERNEL);
		if (!tbl->t[n]) {
			gf128mul_free_64k(tbl);
			return NULL;
		}
	}

	/* table 0: index 128 holds g itself, each halved index one more x */
	tbl->t[0]->t[128] = *g;
	for (bit = 64; bit > 0; bit >>= 1)
		gf128mul_x_lle(&tbl->t[0]->t[bit], &tbl->t[0]->t[2 * bit]);

	for (n = 0; n < 16; n++) {
		if (n > 0) {
			/* power-of-two entries: previous table times x^8 */
			for (bit = 128; bit > 0; bit >>= 1) {
				tbl->t[n]->t[bit] = tbl->t[n - 1]->t[bit];
				gf128mul_x8_lle(&tbl->t[n]->t[bit]);
			}
		}

		for (bit = 2; bit < 256; bit <<= 1)
			for (lo = 1; lo < bit; lo++)
				be128_xor(&tbl->t[n]->t[bit + lo],
					  &tbl->t[n]->t[bit],
					  &tbl->t[n]->t[lo]);
	}

	return tbl;
}
EXPORT_SYMBOL(gf128mul_init_64k_lle);

/*
 * gf128mul_init_64k_bbe - allocate and fill the 16 x 256 entry "bbe" tables
 * @g: field element the tables are built for
 *
 * Table 0 is seeded with g at index 1 and each doubled index holds one
 * more multiplication by x.  Every later table takes its power-of-two
 * entries from the previous table multiplied by x^8.  Within each table
 * the remaining entries are the XOR of the entry for their highest set
 * bit and the entry for the remaining bits.
 *
 * Returns the new table set, or NULL if an allocation fails (whatever had
 * been allocated up to that point is released first).
 */
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
{
	struct gf128mul_64k *tbl;
	int n, bit, lo;

	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return NULL;

	for (n = 0; n < 16; n++) {
		tbl->t[n] = kzalloc(sizeof(*tbl->t[n]), GFP_KERNEL);
		if (!tbl->t[n]) {
			gf128mul_free_64k(tbl);
			return NULL;
		}
	}

	tbl->t[0]->t[1] = *g;
	for (bit = 2; bit <= 128; bit <<= 1)
		gf128mul_x_bbe(&tbl->t[0]->t[bit], &tbl->t[0]->t[bit >> 1]);

	for (n = 0; n < 16; n++) {
		if (n > 0) {
			/* power-of-two entries: previous table times x^8 */
			for (bit = 128; bit > 0; bit >>= 1) {
				tbl->t[n]->t[bit] = tbl->t[n - 1]->t[bit];
				gf128mul_x8_bbe(&tbl->t[n]->t[bit]);
			}
		}

		for (bit = 2; bit < 256; bit <<= 1)
			for (lo = 1; lo < bit; lo++)
				be128_xor(&tbl->t[n]->t[bit + lo],
					  &tbl->t[n]->t[bit],
					  &tbl->t[n]->t[lo]);
	}

	return tbl;
}
EXPORT_SYMBOL(gf128mul_init_64k_bbe);

/*
 * gf128mul_free_64k - release a table set from gf128mul_init_64k_*()
 * @t: table set to release; may be NULL, in which case nothing is done
 *
 * Each of the 16 sub-tables and then the container itself is handed to
 * kzfree().  Sub-table pointers that are still NULL (a partially built
 * set) are passed through unchanged, which the init error paths in this
 * file rely on.
 */
void gf128mul_free_64k(struct gf128mul_64k *t)
{
	int i;

	/* accept NULL so callers can free unconditionally */
	if (!t)
		return;

	for (i = 0; i < 16; i++)
		kzfree(t->t[i]);
	kzfree(t);
}
EXPORT_SYMBOL(gf128mul_free_64k);

/*
 * gf128mul_64k_lle - multiply in place using the 64k "lle" tables
 * @a: element to multiply; overwritten with the product
 * @t: table set built by gf128mul_init_64k_lle()
 *
 * The byte at offset i of @a selects one entry of table i; the 16
 * selected entries are XORed together.  The result is built in a local
 * and copied back only after every byte of @a has been read.
 */
void gf128mul_64k_lle(be128 *a, struct gf128mul_64k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[0]->t[bytes[0]];
	int pos;

	for (pos = 1; pos < 16; pos++)
		be128_xor(&acc, &acc, &t->t[pos]->t[bytes[pos]]);

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_64k_lle);

/*
 * gf128mul_64k_bbe - multiply in place using the 64k "bbe" tables
 * @a: element to multiply; overwritten with the product
 * @t: table set built by gf128mul_init_64k_bbe()
 *
 * The byte at offset 15 - i of @a selects one entry of table i; the 16
 * selected entries are XORed together.  The result is built in a local
 * and copied back only after every byte of @a has been read.
 */
void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[0]->t[bytes[15]];
	int pos;

	for (pos = 1; pos < 16; pos++)
		be128_xor(&acc, &acc, &t->t[pos]->t[bytes[15 - pos]]);

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_64k_bbe);

/*      This version uses 4k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in a
    single byte, we can construct a table of the 256 16 byte
    values that result from the 256 values of this byte.
    This requires 4096 bytes. If we take the highest byte in
    the buffer and use this table to get the result, we then
    have to multiply by x^120 to get the final value. For the
    next highest byte the result has to be multiplied by x^112
    and so on. But we can do this by accumulating the result
    in an accumulator starting with the result for the top
    byte.  We repeatedly multiply the accumulator value by
    x^8 and then add in (i.e. xor) the 16 byte table entry
    for the next lower byte in the buffer, stopping when we
    reach the lowest byte. This requires a 4096 byte table.
*/
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[128] = *g;
	for (j = 64; j > 0; j >>= 1)
		gf128mul_x_lle(&t->t[j], &t->t[j+j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_lle);

struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_bbe);

struct gf128mul_4k *gf128mul_init_4k_ble(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_ble(&t->t[j + j], &t->t[j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_ble);

/*
 * gf128mul_4k_lle - multiply in place using the 4k "lle" table
 * @a: element to multiply; overwritten with the product
 * @t: table built by gf128mul_init_4k_lle()
 *
 * The accumulator starts as the table entry for the byte at offset 15.
 * For each remaining byte, from offset 14 down to 0, the accumulator is
 * multiplied by x^8 and the table entry for that byte is XORed in.  The
 * result is copied back only after every byte of @a has been read.
 */
void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[15]];
	int pos;

	for (pos = 14; pos >= 0; pos--) {
		gf128mul_x8_lle(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[pos]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_lle);

/*
 * gf128mul_4k_bbe - multiply in place using the 4k "bbe" table
 * @a: element to multiply; overwritten with the product
 * @t: table built by gf128mul_init_4k_bbe()
 *
 * The accumulator starts as the table entry for the byte at offset 0.
 * For each remaining byte, from offset 1 up to 15, the accumulator is
 * multiplied by x^8 and the table entry for that byte is XORed in.  The
 * result is copied back only after every byte of @a has been read.
 */
void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[0]];
	int pos;

	for (pos = 1; pos < 16; pos++) {
		gf128mul_x8_bbe(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[pos]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_bbe);

/*
 * gf128mul_4k_ble - multiply in place using the 4k "ble" table
 * @a: element to multiply; overwritten with the product
 * @t: table built by gf128mul_init_4k_ble()
 *
 * The accumulator starts as the table entry for the byte at offset 15.
 * For each remaining byte, from offset 14 down to 0, the accumulator is
 * multiplied by x^8 and the table entry for that byte is XORed in.  The
 * result is copied back only after every byte of @a has been read.
 */
void gf128mul_4k_ble(be128 *a, struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[15]];
	int pos;

	for (pos = 14; pos >= 0; pos--) {
		gf128mul_x8_ble(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[pos]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_ble);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
Commit	Line	Data
c494e070 RS	1	/* gf128mul.c - GF(2^128) multiplication functions
	2	*
	3	* Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
	4	* Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
	5	*
	6	* Based on Dr Brian Gladman's (GPL'd) work published at
8c882f64	7	* http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
c494e070 RS	8	* See the original copyright notice below.
	9	*
	10	* This program is free software; you can redistribute it and/or modify it
	11	* under the terms of the GNU General Public License as published by the Free
	12	* Software Foundation; either version 2 of the License, or (at your option)
	13	* any later version.
	14	*/
	15
	16	/*
	17	---------------------------------------------------------------------------
	18	Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
	19
	20	LICENSE TERMS
	21
	22	The free distribution and use of this software in both source and binary
	23	form is allowed (with or without changes) provided that:
	24
	25	1. distributions of this source code include the above copyright
	26	notice, this list of conditions and the following disclaimer;
	27
	28	2. distributions in binary form include the above copyright
	29	notice, this list of conditions and the following disclaimer
	30	in the documentation and/or other associated materials;
	31
	32	3. the copyright holder's name is not used to endorse products
	33	built using this software without specific written permission.
	34
	35	ALTERNATIVELY, provided that this notice is retained in full, this product
	36	may be distributed under the terms of the GNU General Public License (GPL),
	37	in which case the provisions of the GPL apply INSTEAD OF those given above.
	38
	39	DISCLAIMER
	40
	41	This software is provided 'as is' with no explicit or implied warranties
	42	in respect of its properties, including, but not limited to, correctness
	43	and/or fitness for purpose.
	44	---------------------------------------------------------------------------
	45	Issue 31/01/2006
	46
ce2ace45	47	This file provides fast multiplication in GF(2^128) as required by several
c494e070 RS	48	cryptographic authentication modes
	49	*/
	50
	51	#include <crypto/gf128mul.h>
	52	#include <linux/kernel.h>
	53	#include <linux/module.h>
	54	#include <linux/slab.h>
	55
	56	#define gf128mul_dat(q) { \
	57	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	58	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	59	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	60	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	61	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	62	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	63	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	64	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	65	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	66	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	67	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	68	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	69	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	70	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	71	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	72	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	73	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	74	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	75	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	76	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	77	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	78	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	79	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	80	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	81	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	82	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	83	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	84	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	85	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	86	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	87	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	88	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
	89	}
	90
3eaf06b7 EB	91	/*
	92	* Given a value i in 0..255 as the byte overflow when a field element
	93	* in GF(2^128) is multiplied by x^8, the following macro returns the
	94	* 16-bit value that must be XOR-ed into the low-degree end of the
	95	* product to reduce it modulo the irreducible polynomial x^128 + x^7 +
	96	* x^2 + x + 1.
	97	*
	98	* There are two versions of the macro, and hence two tables: one for
	99	* the "be" convention where the highest-order bit is the coefficient of
	100	* the highest-degree polynomial term, and one for the "le" convention
	101	* where the highest-order bit is the coefficient of the lowest-degree
	102	* polynomial term. In both cases the values are stored in CPU byte
	103	* endianness such that the coefficients are ordered consistently across
	104	* bytes, i.e. in the "be" table bits 15..0 of the stored value
	105	* correspond to the coefficients of x^15..x^0, and in the "le" table
	106	* bits 15..0 correspond to the coefficients of x^0..x^15.
	107	*
	108	* Therefore, provided that the appropriate byte endianness conversions
	109	* are done by the multiplication functions (and these must be in place
	110	* anyway to support both little endian and big endian CPUs), the "be"
	111	* table can be used for multiplications of both "bbe" and "ble"
	112	* elements, and the "le" table can be used for multiplications of both
	113	* "lle" and "lbe" elements.
	114	*/
c494e070	115
3eaf06b7 EB	116	#define xda_be(i) ( \
	117	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
	118	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
	119	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
	120	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
c494e070 RS	121	)
c494e070 RS	122
3eaf06b7 EB	123	#define xda_le(i) ( \
	124	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
	125	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
	126	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
	127	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
c494e070 RS	128	)
c494e070 RS	129
3eaf06b7 EB	130	static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
3eaf06b7 EB	131	static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
c494e070	132
ce2ace45 AC	133	/*
	134	* The following functions multiply a field element by x or by x^8 in
	135	* the polynomial field representation. They use 64-bit word operations
	136	* to gain speed but compensate for machine endianness and hence work
c494e070 RS	137	* correctly on both styles of machine.
	138	*/
	139
	140	static void gf128mul_x_lle(be128 r, const be128 x)
	141	{
	142	u64 a = be64_to_cpu(x->a);
	143	u64 b = be64_to_cpu(x->b);
3eaf06b7	144	u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
c494e070 RS	145
	146	r->b = cpu_to_be64((b >> 1) \| (a << 63));
	147	r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
	148	}
	149
	150	static void gf128mul_x_bbe(be128 r, const be128 x)
	151	{
	152	u64 a = be64_to_cpu(x->a);
	153	u64 b = be64_to_cpu(x->b);
3eaf06b7	154	u64 _tt = gf128mul_table_be[a >> 63];
c494e070 RS	155
	156	r->a = cpu_to_be64((a << 1) \| (b >> 63));
	157	r->b = cpu_to_be64((b << 1) ^ _tt);
	158	}
	159
f19f5111 RS	160	void gf128mul_x_ble(be128 r, const be128 x)
	161	{
	162	u64 a = le64_to_cpu(x->a);
	163	u64 b = le64_to_cpu(x->b);
3eaf06b7	164	u64 _tt = gf128mul_table_be[b >> 63];
f19f5111 RS	165
	166	r->a = cpu_to_le64((a << 1) ^ _tt);
	167	r->b = cpu_to_le64((b << 1) \| (a >> 63));
	168	}
	169	EXPORT_SYMBOL(gf128mul_x_ble);
	170
c494e070 RS	171	static void gf128mul_x8_lle(be128 *x)
	172	{
	173	u64 a = be64_to_cpu(x->a);
	174	u64 b = be64_to_cpu(x->b);
3eaf06b7	175	u64 _tt = gf128mul_table_le[b & 0xff];
c494e070 RS	176
	177	x->b = cpu_to_be64((b >> 8) \| (a << 56));
	178	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
	179	}
	180
	181	static void gf128mul_x8_bbe(be128 *x)
	182	{
	183	u64 a = be64_to_cpu(x->a);
	184	u64 b = be64_to_cpu(x->b);
3eaf06b7	185	u64 _tt = gf128mul_table_be[a >> 56];
c494e070 RS	186
	187	x->a = cpu_to_be64((a << 8) \| (b >> 56));
	188	x->b = cpu_to_be64((b << 8) ^ _tt);
	189	}
	190
ce2ace45 AC	191	static void gf128mul_x8_ble(be128 *x)
	192	{
	193	u64 a = le64_to_cpu(x->b);
	194	u64 b = le64_to_cpu(x->a);
	195	u64 _tt = gf128mul_table_be[a >> 56];
	196
	197	x->b = cpu_to_le64((a << 8) \| (b >> 56));
	198	x->a = cpu_to_le64((b << 8) ^ _tt);
	199	}
	200
c494e070 RS	201	void gf128mul_lle(be128 r, const be128 b)
	202	{
	203	be128 p[8];
	204	int i;
	205
	206	p[0] = *r;
	207	for (i = 0; i < 7; ++i)
	208	gf128mul_x_lle(&p[i + 1], &p[i]);
	209
62542663	210	memset(r, 0, sizeof(*r));
c494e070 RS	211	for (i = 0;;) {
	212	u8 ch = ((u8 *)b)[15 - i];
	213
	214	if (ch & 0x80)
	215	be128_xor(r, r, &p[0]);
	216	if (ch & 0x40)
	217	be128_xor(r, r, &p[1]);
	218	if (ch & 0x20)
	219	be128_xor(r, r, &p[2]);
	220	if (ch & 0x10)
	221	be128_xor(r, r, &p[3]);
	222	if (ch & 0x08)
	223	be128_xor(r, r, &p[4]);
	224	if (ch & 0x04)
	225	be128_xor(r, r, &p[5]);
	226	if (ch & 0x02)
	227	be128_xor(r, r, &p[6]);
	228	if (ch & 0x01)
	229	be128_xor(r, r, &p[7]);
	230
	231	if (++i >= 16)
	232	break;
	233
	234	gf128mul_x8_lle(r);
	235	}
	236	}
	237	EXPORT_SYMBOL(gf128mul_lle);
	238
	239	void gf128mul_bbe(be128 r, const be128 b)
	240	{
	241	be128 p[8];
	242	int i;
	243
	244	p[0] = *r;
	245	for (i = 0; i < 7; ++i)
	246	gf128mul_x_bbe(&p[i + 1], &p[i]);
	247
62542663	248	memset(r, 0, sizeof(*r));
c494e070 RS	249	for (i = 0;;) {
	250	u8 ch = ((u8 *)b)[i];
	251
	252	if (ch & 0x80)
	253	be128_xor(r, r, &p[7]);
	254	if (ch & 0x40)
	255	be128_xor(r, r, &p[6]);
	256	if (ch & 0x20)
	257	be128_xor(r, r, &p[5]);
	258	if (ch & 0x10)
	259	be128_xor(r, r, &p[4]);
	260	if (ch & 0x08)
	261	be128_xor(r, r, &p[3]);
	262	if (ch & 0x04)
	263	be128_xor(r, r, &p[2]);
	264	if (ch & 0x02)
	265	be128_xor(r, r, &p[1]);
	266	if (ch & 0x01)
	267	be128_xor(r, r, &p[0]);
	268
	269	if (++i >= 16)
	270	break;
	271
	272	gf128mul_x8_bbe(r);
	273	}
	274	}
	275	EXPORT_SYMBOL(gf128mul_bbe);
	276
ce2ace45 AC	277	void gf128mul_ble(be128 r, const be128 b)
	278	{
	279	be128 p[8];
	280	int i;
	281
	282	p[0] = *r;
	283	for (i = 0; i < 7; ++i)
	284	gf128mul_x_ble((be128 )&p[i + 1], (be128 )&p[i]);
	285
	286	memset(r, 0, sizeof(*r));
	287	for (i = 0;;) {
	288	u8 ch = ((u8 *)b)[15 - i];
	289
	290	if (ch & 0x80)
	291	be128_xor(r, r, &p[7]);
	292	if (ch & 0x40)
	293	be128_xor(r, r, &p[6]);
	294	if (ch & 0x20)
	295	be128_xor(r, r, &p[5]);
	296	if (ch & 0x10)
	297	be128_xor(r, r, &p[4]);
	298	if (ch & 0x08)
	299	be128_xor(r, r, &p[3]);
	300	if (ch & 0x04)
	301	be128_xor(r, r, &p[2]);
	302	if (ch & 0x02)
	303	be128_xor(r, r, &p[1]);
	304	if (ch & 0x01)
	305	be128_xor(r, r, &p[0]);
	306
	307	if (++i >= 16)
	308	break;
	309
	310	gf128mul_x8_ble(r);
	311	}
	312	}
	313	EXPORT_SYMBOL(gf128mul_ble);
	314
	315
c494e070 RS	316	/* This version uses 64k bytes of table space.
c494e070 RS	317	A 16 byte buffer has to be multiplied by a 16 byte key
ce2ace45	318	value in GF(2^128). If we consider a GF(2^128) value in
c494e070 RS	319	the buffer's lowest byte, we can construct a table of
	320	the 256 16 byte values that result from the 256 values
	321	of this byte. This requires 4096 bytes. But we also
	322	need tables for each of the 16 higher bytes in the
	323	buffer as well, which makes 64 kbytes in total.
	324	*/
	325	/* additional explanation
	326	* t[0][BYTE] contains g*BYTE
	327	* t[1][BYTE] contains gx^8BYTE
	328	* ..
	329	* t[15][BYTE] contains gx^120BYTE */
	330	struct gf128mul_64k gf128mul_init_64k_lle(const be128 g)
	331	{
	332	struct gf128mul_64k *t;
	333	int i, j, k;
	334
	335	t = kzalloc(sizeof(*t), GFP_KERNEL);
	336	if (!t)
	337	goto out;
	338
	339	for (i = 0; i < 16; i++) {
	340	t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
	341	if (!t->t[i]) {
	342	gf128mul_free_64k(t);
	343	t = NULL;
	344	goto out;
	345	}
	346	}
	347
	348	t->t[0]->t[128] = *g;
	349	for (j = 64; j > 0; j >>= 1)
	350	gf128mul_x_lle(&t->t[0]->t[j], &t->t[0]->t[j + j]);
	351
	352	for (i = 0;;) {
	353	for (j = 2; j < 256; j += j)
	354	for (k = 1; k < j; ++k)
	355	be128_xor(&t->t[i]->t[j + k],
	356	&t->t[i]->t[j], &t->t[i]->t[k]);
	357
	358	if (++i >= 16)
	359	break;
	360
	361	for (j = 128; j > 0; j >>= 1) {
	362	t->t[i]->t[j] = t->t[i - 1]->t[j];
	363	gf128mul_x8_lle(&t->t[i]->t[j]);
	364	}
	365	}
	366
	367	out:
	368	return t;
	369	}
	370	EXPORT_SYMBOL(gf128mul_init_64k_lle);
	371
	372	struct gf128mul_64k gf128mul_init_64k_bbe(const be128 g)
	373	{
	374	struct gf128mul_64k *t;
	375	int i, j, k;
	376
	377	t = kzalloc(sizeof(*t), GFP_KERNEL);
	378	if (!t)
	379	goto out;
	380
	381	for (i = 0; i < 16; i++) {
	382	t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
383	if (!t->t[i]) {
384	gf128mul_free_64k(t);
385	t = NULL;
386	goto out;
387	}
388	}
389
390	t->t[0]->t[1] = *g;
391	for (j = 1; j <= 64; j <<= 1)
392	gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
393
394	for (i = 0;;) {
395	for (j = 2; j < 256; j += j)
396	for (k = 1; k < j; ++k)
397	be128_xor(&t->t[i]->t[j + k],
398	&t->t[i]->t[j], &t->t[i]->t[k]);
399
400	if (++i >= 16)
401	break;
402
403	for (j = 128; j > 0; j >>= 1) {
404	t->t[i]->t[j] = t->t[i - 1]->t[j];
405	gf128mul_x8_bbe(&t->t[i]->t[j]);
406	}
407	}
408
409	out:
410	return t;
411	}
412	EXPORT_SYMBOL(gf128mul_init_64k_bbe);
413
414	void gf128mul_free_64k(struct gf128mul_64k *t)
415	{
416	int i;
417
418	for (i = 0; i < 16; i++)
8ea7531e AC	419	kzfree(t->t[i]);
8ea7531e AC	420	kzfree(t);
c494e070 RS	421	}
	422	EXPORT_SYMBOL(gf128mul_free_64k);
	423
	424	void gf128mul_64k_lle(be128 a, struct gf128mul_64k t)
	425	{
	426	u8 ap = (u8 )a;
	427	be128 r[1];
	428	int i;
	429
	430	*r = t->t[0]->t[ap[0]];
	431	for (i = 1; i < 16; ++i)
	432	be128_xor(r, r, &t->t[i]->t[ap[i]]);
	433	a = r;
	434	}
	435	EXPORT_SYMBOL(gf128mul_64k_lle);
	436
	437	void gf128mul_64k_bbe(be128 a, struct gf128mul_64k t)
	438	{
	439	u8 ap = (u8 )a;
	440	be128 r[1];
	441	int i;
	442
	443	*r = t->t[0]->t[ap[15]];
	444	for (i = 1; i < 16; ++i)
	445	be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
	446	a = r;
	447	}
	448	EXPORT_SYMBOL(gf128mul_64k_bbe);
	449
	450	/* This version uses 4k bytes of table space.
	451	A 16 byte buffer has to be multiplied by a 16 byte key
ce2ace45	452	value in GF(2^128). If we consider a GF(2^128) value in a
c494e070 RS	453	single byte, we can construct a table of the 256 16 byte
	454	values that result from the 256 values of this byte.
	455	This requires 4096 bytes. If we take the highest byte in
	456	the buffer and use this table to get the result, we then
	457	have to multiply by x^120 to get the final value. For the
	458	next highest byte the result has to be multiplied by x^112
	459	and so on. But we can do this by accumulating the result
	460	in an accumulator starting with the result for the top
	461	byte. We repeatedly multiply the accumulator value by
	462	x^8 and then add in (i.e. xor) the 16 bytes of the next
	463	lower byte in the buffer, stopping when we reach the
	464	lowest byte. This requires a 4096 byte table.
	465	*/
	466	struct gf128mul_4k gf128mul_init_4k_lle(const be128 g)
	467	{
	468	struct gf128mul_4k *t;
	469	int j, k;
	470
	471	t = kzalloc(sizeof(*t), GFP_KERNEL);
	472	if (!t)
	473	goto out;
	474
	475	t->t[128] = *g;
	476	for (j = 64; j > 0; j >>= 1)
	477	gf128mul_x_lle(&t->t[j], &t->t[j+j]);
	478
	479	for (j = 2; j < 256; j += j)
	480	for (k = 1; k < j; ++k)
	481	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	482
	483	out:
	484	return t;
	485	}
	486	EXPORT_SYMBOL(gf128mul_init_4k_lle);
	487
	488	struct gf128mul_4k gf128mul_init_4k_bbe(const be128 g)
	489	{
	490	struct gf128mul_4k *t;
	491	int j, k;
	492
	493	t = kzalloc(sizeof(*t), GFP_KERNEL);
	494	if (!t)
	495	goto out;
	496
	497	t->t[1] = *g;
	498	for (j = 1; j <= 64; j <<= 1)
	499	gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
	500
	501	for (j = 2; j < 256; j += j)
	502	for (k = 1; k < j; ++k)
	503	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	504
	505	out:
	506	return t;
	507	}
	508	EXPORT_SYMBOL(gf128mul_init_4k_bbe);
	509
ce2ace45 AC	510	struct gf128mul_4k gf128mul_init_4k_ble(const be128 g)
	511	{
	512	struct gf128mul_4k *t;
	513	int j, k;
	514
	515	t = kzalloc(sizeof(*t), GFP_KERNEL);
	516	if (!t)
	517	goto out;
	518
	519	t->t[1] = *g;
	520	for (j = 1; j <= 64; j <<= 1)
	521	gf128mul_x_ble(&t->t[j + j], &t->t[j]);
	522
	523	for (j = 2; j < 256; j += j)
	524	for (k = 1; k < j; ++k)
	525	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	526
	527	out:
	528	return t;
	529	}
	530	EXPORT_SYMBOL(gf128mul_init_4k_ble);
	531
c494e070 RS	532	void gf128mul_4k_lle(be128 a, struct gf128mul_4k t)
	533	{
	534	u8 ap = (u8 )a;
	535	be128 r[1];
	536	int i = 15;
	537
	538	*r = t->t[ap[15]];
	539	while (i--) {
	540	gf128mul_x8_lle(r);
	541	be128_xor(r, r, &t->t[ap[i]]);
	542	}
	543	a = r;
	544	}
	545	EXPORT_SYMBOL(gf128mul_4k_lle);
	546
	547	void gf128mul_4k_bbe(be128 a, struct gf128mul_4k t)
	548	{
	549	u8 ap = (u8 )a;
	550	be128 r[1];
	551	int i = 0;
	552
	553	*r = t->t[ap[0]];
	554	while (++i < 16) {
	555	gf128mul_x8_bbe(r);
	556	be128_xor(r, r, &t->t[ap[i]]);
	557	}
	558	a = r;
	559	}
	560	EXPORT_SYMBOL(gf128mul_4k_bbe);
	561
ce2ace45 AC	562	void gf128mul_4k_ble(be128 a, struct gf128mul_4k t)
	563	{
	564	u8 ap = (u8 )a;
	565	be128 r[1];
	566	int i = 15;
	567
	568	*r = t->t[ap[15]];
	569	while (i--) {
	570	gf128mul_x8_ble(r);
	571	be128_xor(r, r, &t->t[ap[i]]);
	572	}
	573	a = r;
	574	}
	575	EXPORT_SYMBOL(gf128mul_4k_ble);
	576
c494e070 RS	577	MODULE_LICENSE("GPL");
c494e070 RS	578	MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");