diff -Nru libmlx4-1.0.6/debian/changelog libmlx4-1.0.6/debian/changelog
--- libmlx4-1.0.6/debian/changelog	2014-05-06 14:36:02.000000000 -0300
+++ libmlx4-1.0.6/debian/changelog	2015-05-07 16:28:54.000000000 -0300
@@ -1,3 +1,9 @@
+libmlx4 (1.0.6-1ubuntu1) vivid; urgency=medium
+
+  * Add checksum offload support capability (LP: #1409904)
+
+ -- Rafael David Tinoco (Inaddy)  Wed, 29 Apr 2015 21:14:09 -0300
+
 libmlx4 (1.0.6-1) unstable; urgency=low
 
   * New upstream release.
diff -Nru libmlx4-1.0.6/debian/control libmlx4-1.0.6/debian/control
--- libmlx4-1.0.6/debian/control	2014-05-06 14:36:02.000000000 -0300
+++ libmlx4-1.0.6/debian/control	2015-05-07 16:29:26.000000000 -0300
@@ -1,7 +1,8 @@
 Source: libmlx4
 Priority: extra
-Maintainer: Roland Dreier
-Build-Depends: debhelper (>= 7.0.50~), dpkg-dev (>= 1.13.19), libibverbs-dev (>= 1.1.8), dh-autoreconf
+Maintainer: Ubuntu Developers
+XSBC-Original-Maintainer: Roland Dreier
+Build-Depends: debhelper (>= 7.0.50~), dpkg-dev (>= 1.13.19), libibverbs-dev (>= 1.1.8-1ubuntu2), dh-autoreconf
 Standards-Version: 3.9.5
 Section: libs
 Homepage: http://www.openfabrics.org/
@@ -9,7 +10,7 @@
 Package: libmlx4-1
 Section: libs
 Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}, libibverbs1 (>= 1.1.8)
+Depends: ${shlibs:Depends}, ${misc:Depends}, libibverbs1 (>= 1.1.8-1ubuntu2)
 Description: Userspace driver for Mellanox ConnectX InfiniBand HCAs
  libmlx4 is a device-specific driver for Mellanox ConnectX InfiniBand
  host channel adapters (HCAs) for the libibverbs library.  This allows
diff -Nru libmlx4-1.0.6/debian/patches/Add-checksum-offload-support-capability.patch libmlx4-1.0.6/debian/patches/Add-checksum-offload-support-capability.patch
--- libmlx4-1.0.6/debian/patches/Add-checksum-offload-support-capability.patch	1969-12-31 21:00:00.000000000 -0300
+++ libmlx4-1.0.6/debian/patches/Add-checksum-offload-support-capability.patch	2015-05-07 16:28:30.000000000 -0300
@@ -0,0 +1,767 @@
+Description: Add checksum offload support capability
+
+Set the IP and TCP/UDP checksum offload flags in the send WQE when the caller requests IBV_SEND_IP_CSUM (UD and raw packet QPs).
+
+Author: Matan Barak
+Signed-off-by: Matan Barak
+Signed-off-by: Or Gerlitz
+Signed-off-by: Roland Dreier
+
+Origin: upstream, commit: 1b6875d4c2cba3b751ad0b4286cf52b6a22bff97
+Bug-Ubuntu: https://launchpad.net/bugs/1409904
+Last-Update: 2015-02-09
+
+--- libmlx4-1.0.6.orig/src/qp.c
++++ libmlx4-1.0.6/src/qp.c
+@@ -286,6 +286,10 @@ int mlx4_post_send(struct ibv_qp *ibqp,
+ 		break;
+ 
+ 	case IBV_QPT_UD:
++		ctrl->srcrb_flags |=
++			wr->send_flags & IBV_SEND_IP_CSUM ?
++			htonl(MLX4_WQE_CTRL_IP_CSUM |
++			      MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0;
+ 		set_datagram_seg(wqe, wr);
+ 		wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ 		size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+@@ -294,7 +298,12 @@ int mlx4_post_send(struct ibv_qp *ibqp,
+ 	case IBV_QPT_RAW_PACKET:
+ 		/* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
+ 		 * to indicate that no icrc should be calculated */
+-		ctrl->srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT);
++		ctrl->srcrb_flags |=
++			wr->send_flags & IBV_SEND_IP_CSUM ?
++			htonl(MLX4_WQE_CTRL_IP_CSUM |
++			      MLX4_WQE_CTRL_TCP_UDP_CSUM |
++			      MLX4_WQE_CTRL_SOLICIT) :
++			htonl(MLX4_WQE_CTRL_SOLICIT);
+ 		break;
+ 
+ 	default:
+--- /dev/null
++++ libmlx4-1.0.6/src/qp.c.orig
+@@ -0,0 +1,713 @@
++/*
++ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
++ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
++ * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
++ *
++ * This software is available to you under a choice of one of two
++ * licenses.  You may choose to be licensed under the terms of the GNU
++ * General Public License (GPL) Version 2, available from the file
++ * COPYING in the main directory of this source tree, or the
++ * OpenIB.org BSD license below:
++ *
++ *     Redistribution and use in source and binary forms, with or
++ *     without modification, are permitted provided that the following
++ *     conditions are met:
++ *
++ *      - Redistributions of source code must retain the above
++ *        copyright notice, this list of conditions and the following
++ *        disclaimer.
++ *
++ *      - Redistributions in binary form must reproduce the above
++ *        copyright notice, this list of conditions and the following
++ *        disclaimer in the documentation and/or other materials
++ *        provided with the distribution.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
++ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
++ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
++ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
++ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
++ * SOFTWARE.
++ */
++
++#if HAVE_CONFIG_H
++#  include <config.h>
++#endif /* HAVE_CONFIG_H */
++
++#include <stdlib.h>
++#include <netinet/in.h>
++#include <pthread.h>
++#include <string.h>
++#include <errno.h>
++
++#include "mlx4.h"
++#include "doorbell.h"
++#include "wqe.h"
++
++static const uint32_t mlx4_ib_opcode[] = {
++	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
++	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
++	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
++	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
++	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
++	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
++	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
++};
++
++static void *get_recv_wqe(struct mlx4_qp *qp, int n)
++{
++	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
++}
++
++static void *get_send_wqe(struct mlx4_qp *qp, int n)
++{
++	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
++}
++
++/*
++ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
++ * first four bytes of every 64 byte chunk with 0xffffffff, except for
++ * the very first chunk of the WQE.
++ */
++static void stamp_send_wqe(struct mlx4_qp *qp, int n)
++{
++	uint32_t *wqe = get_send_wqe(qp, n);
++	int i;
++	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;
++
++	for (i = 16; i < ds; i += 16)
++		wqe[i] = 0xffffffff;
++}
++
++void mlx4_init_qp_indices(struct mlx4_qp *qp)
++{
++	qp->sq.head = 0;
++	qp->sq.tail = 0;
++	qp->rq.head = 0;
++	qp->rq.tail = 0;
++}
++
++void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
++{
++	struct mlx4_wqe_ctrl_seg *ctrl;
++	int i;
++
++	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
++		ctrl = get_send_wqe(qp, i);
++		ctrl->owner_opcode = htonl(1 << 31);
++		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
++
++		stamp_send_wqe(qp, i);
++	}
++}
++
++static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
++{
++	unsigned cur;
++
++	cur = wq->head - wq->tail;
++	if (cur + nreq < wq->max_post)
++		return 0;
++
++	pthread_spin_lock(&cq->lock);
++	cur = wq->head - wq->tail;
++	pthread_spin_unlock(&cq->lock);
++
++	return cur + nreq >= wq->max_post;
++}
++
++static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
++				 uint64_t remote_addr, uint32_t rkey)
++{
++	rseg->raddr = htonll(remote_addr);
++	rseg->rkey = htonl(rkey);
++	rseg->reserved = 0;
++}
++
++static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
++{
++	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
++		aseg->swap_add = htonll(wr->wr.atomic.swap);
++		aseg->compare = htonll(wr->wr.atomic.compare_add);
++	} else {
++		aseg->swap_add = htonll(wr->wr.atomic.compare_add);
++		aseg->compare = 0;
++	}
++
++}
++
++static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
++			     struct ibv_send_wr *wr)
++{
++	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
++	dseg->dqpn = htonl(wr->wr.ud.remote_qpn);
++	dseg->qkey = htonl(wr->wr.ud.remote_qkey);
++	dseg->vlan = htons(to_mah(wr->wr.ud.ah)->vlan);
++	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
++}
++
++static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
++{
++	dseg->byte_count = htonl(sg->length);
++	dseg->lkey = htonl(sg->lkey);
++	dseg->addr = htonll(sg->addr);
++}
++
++static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
++{
++	dseg->lkey = htonl(sg->lkey);
++	dseg->addr = htonll(sg->addr);
++
++	/*
++	 * Need a barrier here before writing the byte_count field to
++	 * make sure that all the data is visible before the
++	 * byte_count field is set.  Otherwise, if the segment begins
++	 * a new cacheline, the HCA prefetcher could grab the 64-byte
++	 * chunk and get a valid (!= * 0xffffffff) byte count but
++	 * stale data, and end up sending the wrong data.
++	 */
++	wmb();
++
++	dseg->byte_count = htonl(sg->length);
++}
++
++/*
++ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
++ * implementations may use move-string-buffer assembler instructions,
++ * which do not guarantee order of copying.
++ */
++static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
++{
++	while (bytecnt > 0) {
++		*dst++ = *src++;
++		*dst++ = *src++;
++		bytecnt -= 2 * sizeof (long);
++	}
++}
++
++int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
++		   struct ibv_send_wr **bad_wr)
++{
++	struct mlx4_context *ctx;
++	struct mlx4_qp *qp = to_mqp(ibqp);
++	void *wqe;
++	struct mlx4_wqe_ctrl_seg *ctrl;
++	int ind;
++	int nreq;
++	int inl = 0;
++	int ret = 0;
++	int size;
++	int i;
++
++	pthread_spin_lock(&qp->sq.lock);
++
++	/* XXX check that state is OK to post send */
++
++	ind = qp->sq.head;
++
++	for (nreq = 0; wr; ++nreq, wr = wr->next) {
++		if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
++			ret = ENOMEM;
++			*bad_wr = wr;
++			goto out;
++		}
++
++		if (wr->num_sge > qp->sq.max_gs) {
++			ret = ENOMEM;
++			*bad_wr = wr;
++			goto out;
++		}
++
++		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
++			ret = EINVAL;
++			*bad_wr = wr;
++			goto out;
++		}
++
++		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
++		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
++
++		ctrl->srcrb_flags =
++			(wr->send_flags & IBV_SEND_SIGNALED ?
++			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
++			(wr->send_flags & IBV_SEND_SOLICITED ?
++			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0) |
++			qp->sq_signal_bits;
++
++		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
++		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
++			ctrl->imm = wr->imm_data;
++		else
++			ctrl->imm = 0;
++
++		wqe += sizeof *ctrl;
++		size = sizeof *ctrl / 16;
++
++		switch (ibqp->qp_type) {
++		case IBV_QPT_XRC_SEND:
++			ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
++			/* fall through */
++		case IBV_QPT_RC:
++		case IBV_QPT_UC:
++			switch (wr->opcode) {
++			case IBV_WR_ATOMIC_CMP_AND_SWP:
++			case IBV_WR_ATOMIC_FETCH_AND_ADD:
++				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
++					      wr->wr.atomic.rkey);
++				wqe += sizeof (struct mlx4_wqe_raddr_seg);
++
++				set_atomic_seg(wqe, wr);
++				wqe += sizeof (struct mlx4_wqe_atomic_seg);
++				size += (sizeof (struct mlx4_wqe_raddr_seg) +
++					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
++
++				break;
++
++			case IBV_WR_RDMA_READ:
++				inl = 1;
++				/* fall through */
++			case IBV_WR_RDMA_WRITE:
++			case IBV_WR_RDMA_WRITE_WITH_IMM:
++				if (!wr->num_sge)
++					inl = 1;
++				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
++					      wr->wr.rdma.rkey);
++				wqe += sizeof (struct mlx4_wqe_raddr_seg);
++				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
++
++				break;
++
++			default:
++				/* No extra segments required for sends */
++				break;
++			}
++			break;
++
++		case IBV_QPT_UD:
++			set_datagram_seg(wqe, wr);
++			wqe += sizeof (struct mlx4_wqe_datagram_seg);
++			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
++			break;
++
++		case IBV_QPT_RAW_PACKET:
++			/* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
++			 * to indicate that no icrc should be calculated */
++			ctrl->srcrb_flags |= htonl(MLX4_WQE_CTRL_SOLICIT);
++			break;
++
++		default:
++			break;
++		}
++
++		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
++			struct mlx4_wqe_inline_seg *seg;
++			void *addr;
++			int len, seg_len;
++			int num_seg;
++			int off, to_copy;
++
++			inl = 0;
++
++			seg = wqe;
++			wqe += sizeof *seg;
++			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
++			num_seg = 0;
++			seg_len = 0;
++
++			for (i = 0; i < wr->num_sge; ++i) {
++				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
++				len = wr->sg_list[i].length;
++				inl += len;
++
++				if (inl > qp->max_inline_data) {
++					inl = 0;
++					ret = ENOMEM;
++					*bad_wr = wr;
++					goto out;
++				}
++
++				while (len >= MLX4_INLINE_ALIGN - off) {
++					to_copy = MLX4_INLINE_ALIGN - off;
++					memcpy(wqe, addr, to_copy);
++					len -= to_copy;
++					wqe += to_copy;
++					addr += to_copy;
++					seg_len += to_copy;
++					wmb(); /* see comment below */
++					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
++					seg_len = 0;
++					seg = wqe;
++					wqe += sizeof *seg;
++					off = sizeof *seg;
++					++num_seg;
++				}
++
++				memcpy(wqe, addr, len);
++				wqe += len;
++				seg_len += len;
++				off += len;
++			}
++
++			if (seg_len) {
++				++num_seg;
++				/*
++				 * Need a barrier here to make sure
++				 * all the data is visible before the
++				 * byte_count field is set.  Otherwise
++				 * the HCA prefetcher could grab the
++				 * 64-byte chunk with this inline
++				 * segment and get a valid (!=
++				 * 0xffffffff) byte count but stale
++				 * data, and end up sending the wrong
++				 * data.
++				 */
++				wmb();
++				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
++			}
++
++			size += (inl + num_seg * sizeof * seg + 15) / 16;
++		} else {
++			struct mlx4_wqe_data_seg *seg = wqe;
++
++			for (i = wr->num_sge - 1; i >= 0 ; --i)
++				set_data_seg(seg + i, wr->sg_list + i);
++
++			size += wr->num_sge * (sizeof *seg / 16);
++		}
++
++		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
++				    MLX4_WQE_CTRL_FENCE : 0) | size;
++
++		/*
++		 * Make sure descriptor is fully written before
++		 * setting ownership bit (because HW can start
++		 * executing as soon as we do).
++		 */
++		wmb();
++
++		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
++			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);
++
++		/*
++		 * We can improve latency by not stamping the last
++		 * send queue WQE until after ringing the doorbell, so
++		 * only stamp here if there are still more WQEs to post.
++		 */
++		if (wr->next)
++			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
++				       (qp->sq.wqe_cnt - 1));
++
++		++ind;
++	}
++
++out:
++	ctx = to_mctx(ibqp->context);
++
++	if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
++		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
++		*(uint32_t *) ctrl->reserved |= qp->doorbell_qpn;
++		/*
++		 * Make sure that descriptor is written to memory
++		 * before writing to BlueFlame page.
++		 */
++		wmb();
++
++		++qp->sq.head;
++
++		pthread_spin_lock(&ctx->bf_lock);
++
++		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
++			     align(size * 16, 64));
++		wc_wmb();
++
++		ctx->bf_offset ^= ctx->bf_buf_size;
++
++		pthread_spin_unlock(&ctx->bf_lock);
++	} else if (nreq) {
++		qp->sq.head += nreq;
++
++		/*
++		 * Make sure that descriptors are written before
++		 * doorbell record.
++		 */
++		wmb();
++
++		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
++	}
++
++	if (nreq)
++		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
++			       (qp->sq.wqe_cnt - 1));
++
++	pthread_spin_unlock(&qp->sq.lock);
++
++	return ret;
++}
++
++int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
++		   struct ibv_recv_wr **bad_wr)
++{
++	struct mlx4_qp *qp = to_mqp(ibqp);
++	struct mlx4_wqe_data_seg *scat;
++	int ret = 0;
++	int nreq;
++	int ind;
++	int i;
++
++	pthread_spin_lock(&qp->rq.lock);
++
++	/* XXX check that state is OK to post receive */
++
++	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
++
++	for (nreq = 0; wr; ++nreq, wr = wr->next) {
++		if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
++			ret = ENOMEM;
++			*bad_wr = wr;
++			goto out;
++		}
++
++		if (wr->num_sge > qp->rq.max_gs) {
++			ret = ENOMEM;
++			*bad_wr = wr;
++			goto out;
++		}
++
++		scat = get_recv_wqe(qp, ind);
++
++		for (i = 0; i < wr->num_sge; ++i)
++			__set_data_seg(scat + i, wr->sg_list + i);
++
++		if (i < qp->rq.max_gs) {
++			scat[i].byte_count = 0;
++			scat[i].lkey = htonl(MLX4_INVALID_LKEY);
++			scat[i].addr = 0;
++		}
++
++		qp->rq.wrid[ind] = wr->wr_id;
++
++		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
++	}
++
++out:
++	if (nreq) {
++		qp->rq.head += nreq;
++
++		/*
++		 * Make sure that descriptors are written before
++		 * doorbell record.
++		 */
++		wmb();
++
++		*qp->db = htonl(qp->rq.head & 0xffff);
++	}
++
++	pthread_spin_unlock(&qp->rq.lock);
++
++	return ret;
++}
++
++static int num_inline_segs(int data, enum ibv_qp_type type)
++{
++	/*
++	 * Inline data segments are not allowed to cross 64 byte
++	 * boundaries.  For UD QPs, the data segments always start
++	 * aligned to 64 bytes (16 byte control segment + 48 byte
++	 * datagram segment); for other QPs, there will be a 16 byte
++	 * control segment and possibly a 16 byte remote address
++	 * segment, so in the worst case there will be only 32 bytes
++	 * available for the first data segment.
++	 */
++	if (type == IBV_QPT_UD)
++		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
++			 sizeof (struct mlx4_wqe_datagram_seg)) %
++			MLX4_INLINE_ALIGN;
++	else
++		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
++			 sizeof (struct mlx4_wqe_raddr_seg)) %
++			MLX4_INLINE_ALIGN;
++
++	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
++		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
++}
++
++void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
++			   struct mlx4_qp *qp)
++{
++	int size;
++	int max_sq_sge;
++
++	max_sq_sge = align(cap->max_inline_data +
++			   num_inline_segs(cap->max_inline_data, type) *
++			   sizeof (struct mlx4_wqe_inline_seg),
++			   sizeof (struct mlx4_wqe_data_seg)) /
++		sizeof (struct mlx4_wqe_data_seg);
++	if (max_sq_sge < cap->max_send_sge)
++		max_sq_sge = cap->max_send_sge;
++
++	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
++	switch (type) {
++	case IBV_QPT_UD:
++		size += sizeof (struct mlx4_wqe_datagram_seg);
++		break;
++
++	case IBV_QPT_UC:
++		size += sizeof (struct mlx4_wqe_raddr_seg);
++		break;
++
++	case IBV_QPT_XRC_SEND:
++	case IBV_QPT_RC:
++		size += sizeof (struct mlx4_wqe_raddr_seg);
++		/*
++		 * An atomic op will require an atomic segment, a
++		 * remote address segment and one scatter entry.
++		 */
++		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
++			    sizeof (struct mlx4_wqe_raddr_seg) +
++			    sizeof (struct mlx4_wqe_data_seg)))
++			size = (sizeof (struct mlx4_wqe_atomic_seg) +
++				sizeof (struct mlx4_wqe_raddr_seg) +
++				sizeof (struct mlx4_wqe_data_seg));
++		break;
++
++	default:
++		break;
++	}
++
++	/* Make sure that we have enough space for a bind request */
++	if (size < sizeof (struct mlx4_wqe_bind_seg))
++		size = sizeof (struct mlx4_wqe_bind_seg);
++
++	size += sizeof (struct mlx4_wqe_ctrl_seg);
++
++	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
++	     qp->sq.wqe_shift++)
++		; /* nothing */
++}
++
++int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
++		      enum ibv_qp_type type, struct mlx4_qp *qp)
++{
++	qp->rq.max_gs = cap->max_recv_sge;
++
++	if (qp->sq.wqe_cnt) {
++		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
++		if (!qp->sq.wrid)
++			return -1;
++	}
++
++	if (qp->rq.wqe_cnt) {
++		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
++		if (!qp->rq.wrid) {
++			free(qp->sq.wrid);
++			return -1;
++		}
++	}
++
++	for (qp->rq.wqe_shift = 4;
++	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
++	     qp->rq.wqe_shift++)
++		; /* nothing */
++
++	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
++		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
++	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
++		qp->rq.offset = 0;
++		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
++	} else {
++		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
++		qp->sq.offset = 0;
++	}
++
++	if (qp->buf_size) {
++		if (mlx4_alloc_buf(&qp->buf,
++				   align(qp->buf_size, to_mdev(context->device)->page_size),
++				   to_mdev(context->device)->page_size)) {
++			free(qp->sq.wrid);
++			free(qp->rq.wrid);
++			return -1;
++		}
++
++		memset(qp->buf.buf, 0, qp->buf_size);
++	} else {
++		qp->buf.buf = NULL;
++	}
++
++	return 0;
++}
++
++void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
++		       enum ibv_qp_type type)
++{
++	int wqe_size;
++
++	wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
++	switch (type) {
++	case IBV_QPT_UD:
++		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
++		break;
++
++	case IBV_QPT_XRC_SEND:
++	case IBV_QPT_UC:
++	case IBV_QPT_RC:
++		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
++		break;
++
++	default:
++		break;
++	}
++
++	qp->sq.max_gs = wqe_size / sizeof (struct mlx4_wqe_data_seg);
++	cap->max_send_sge = qp->sq.max_gs;
++	qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
++	cap->max_send_wr = qp->sq.max_post;
++
++	/*
++	 * Inline data segments can't cross a 64 byte boundary.  So
++	 * subtract off one segment header for each 64-byte chunk,
++	 * taking into account the fact that wqe_size will be 32 mod
++	 * 64 for non-UD QPs.
++	 */
++	qp->max_inline_data = wqe_size -
++		sizeof (struct mlx4_wqe_inline_seg) *
++		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
++	cap->max_inline_data = qp->max_inline_data;
++}
++
++struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
++{
++	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
++
++	if (ctx->qp_table[tind].refcnt)
++		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
++	else
++		return NULL;
++}
++
++int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
++{
++	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
++
++	if (!ctx->qp_table[tind].refcnt) {
++		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
++						   sizeof (struct mlx4_qp *));
++		if (!ctx->qp_table[tind].table)
++			return -1;
++	}
++
++	++ctx->qp_table[tind].refcnt;
++	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
++	return 0;
++}
++
++void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
++{
++	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
++
++	if (!--ctx->qp_table[tind].refcnt)
++		free(ctx->qp_table[tind].table);
++	else
++		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
++}
+--- libmlx4-1.0.6.orig/src/wqe.h
++++ libmlx4-1.0.6/src/wqe.h
+@@ -41,6 +41,8 @@ enum {
+ 	MLX4_WQE_CTRL_FENCE	= 1 << 6,
+ 	MLX4_WQE_CTRL_CQ_UPDATE	= 3 << 2,
+ 	MLX4_WQE_CTRL_SOLICIT	= 1 << 1,
++	MLX4_WQE_CTRL_IP_CSUM		= 1 << 4,
++	MLX4_WQE_CTRL_TCP_UDP_CSUM	= 1 << 5
+ };
+ 
+ enum {
diff -Nru libmlx4-1.0.6/debian/patches/series libmlx4-1.0.6/debian/patches/series
--- libmlx4-1.0.6/debian/patches/series	2014-05-06 14:36:02.000000000 -0300
+++ libmlx4-1.0.6/debian/patches/series	2015-05-07 16:28:30.000000000 -0300
@@ -1 +1,2 @@
 driver-plugin-directory.patch
+Add-checksum-offload-support-capability.patch
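
Note (editorial, not part of the debdiff): the sketch below shows how an application might consume the capability this update backports. It is a minimal example under stated assumptions: a UD QP, completion queue, registered buffer and address handle are assumed to already exist, and the helper name post_ud_send_with_csum and its parameters are illustrative only, not part of libmlx4. IBV_SEND_IP_CSUM and IBV_DEVICE_UD_IP_CSUM come from the patched libibverbs, which is why the control file bumps the dependency to 1.1.8-1ubuntu2.

#include <string.h>
#include <infiniband/verbs.h>

/* Post one UD send, asking the HCA to compute the IP and TCP/UDP checksums
 * when the device advertises the capability.  With this patch applied,
 * libmlx4 translates IBV_SEND_IP_CSUM into MLX4_WQE_CTRL_IP_CSUM |
 * MLX4_WQE_CTRL_TCP_UDP_CSUM in the WQE control segment. */
static int post_ud_send_with_csum(struct ibv_context *ctx, struct ibv_qp *qp,
				  struct ibv_ah *ah, uint32_t remote_qpn,
				  uint32_t remote_qkey, struct ibv_sge *sge)
{
	struct ibv_device_attr attr;
	struct ibv_send_wr wr, *bad_wr;

	if (ibv_query_device(ctx, &attr))
		return -1;

	memset(&wr, 0, sizeof wr);
	wr.wr_id             = 1;
	wr.sg_list           = sge;
	wr.num_sge           = 1;
	wr.opcode            = IBV_WR_SEND;
	wr.send_flags        = IBV_SEND_SIGNALED;
	wr.wr.ud.ah          = ah;
	wr.wr.ud.remote_qpn  = remote_qpn;
	wr.wr.ud.remote_qkey = remote_qkey;

	/* Only request the offload when the HCA reports it. */
	if (attr.device_cap_flags & IBV_DEVICE_UD_IP_CSUM)
		wr.send_flags |= IBV_SEND_IP_CSUM;

	return ibv_post_send(qp, &wr, &bad_wr);
}

On the receiving side, a completion with IBV_WC_IP_CSUM_OK set in wc_flags indicates that the hardware already validated the checksums, so the application can skip the software check.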