diff --git a/sys/conf/files b/sys/conf/files index 69a45c41903b..1415e6106467 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -4623,6 +4623,8 @@ dev/mlx5/mlx5_ib/mlx5_ib_cq.c optional mlx5ib pci ofed \ compile-with "${OFED_C}" dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c optional mlx5ib pci ofed \ compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_gsi.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" dev/mlx5/mlx5_ib/mlx5_ib_mad.c optional mlx5ib pci ofed \ compile-with "${OFED_C}" dev/mlx5/mlx5_ib/mlx5_ib_main.c optional mlx5ib pci ofed \ @@ -4633,10 +4635,10 @@ dev/mlx5/mlx5_ib/mlx5_ib_mr.c optional mlx5ib pci ofed \ compile-with "${OFED_C}" dev/mlx5/mlx5_ib/mlx5_ib_qp.c optional mlx5ib pci ofed \ compile-with "${OFED_C}" -dev/mlx5/mlx5_ib/mlx5_ib_roce.c optional mlx5ib pci ofed \ - compile-with "${OFED_C}" dev/mlx5/mlx5_ib/mlx5_ib_srq.c optional mlx5ib pci ofed \ compile-with "${OFED_C}" +dev/mlx5/mlx5_ib/mlx5_ib_virt.c optional mlx5ib pci ofed \ + compile-with "${OFED_C}" dev/mlx5/mlx5_core/mlx5_alloc.c optional mlx5 pci \ compile-with "${OFED_C}" diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib.h b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h index 2ddae912dd15..fd79edf2da4e 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib.h +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib.h @@ -30,31 +30,33 @@ #include #include +#include #include #include -#include -#include -#include #include #include #include #include #include +#include +#include #define mlx5_ib_dbg(dev, format, arg...) \ -pr_debug("mlx5_dbg:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, curthread->td_proc->p_pid, ##arg) +pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) #define mlx5_ib_err(dev, format, arg...) \ -printf("mlx5_ib: ERR: ""mlx5_err:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, curthread->td_proc->p_pid, ##arg) +pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) #define mlx5_ib_warn(dev, format, arg...) 
\ -printf("mlx5_ib: WARN: ""mlx5_warn:%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, curthread->td_proc->p_pid, ##arg) -#define BF_ENABLE 0 +pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ + __LINE__, current->pid, ##arg) -extern struct workqueue_struct *mlx5_ib_wq; +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) +#define MLX5_IB_DEFAULT_UIDX 0xffffff +#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) enum { MLX5_IB_MMAP_CMD_SHIFT = 8, @@ -66,13 +68,8 @@ enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, MLX5_IB_MMAP_WC_PAGE = 2, MLX5_IB_MMAP_NC_PAGE = 3, - MLX5_IB_MMAP_MAP_DC_INFO_PAGE = 4, - - /* Use EXP mmap commands until it is pushed to upstream */ - MLX5_IB_EXP_MMAP_CORE_CLOCK = 0xFB, - MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA = 0xFC, - MLX5_IB_EXP_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA = 0xFD, - MLX5_IB_EXP_ALLOC_N_MMAP_WC = 0xFE, + /* 5 is chosen in order to be compatible with old versions of libmlx5 */ + MLX5_IB_MMAP_CORE_CLOCK = 5, }; enum { @@ -82,11 +79,6 @@ enum { MLX5_REQ_SCAT_DATA64_CQE = 0x22, }; -enum { - MLX5_DCT_CS_RES_64 = 2, - MLX5_CNAK_RX_POLL_CQ_QUOTA = 256, -}; - enum mlx5_ib_latency_class { MLX5_IB_LATENCY_CLASS_LOW, MLX5_IB_LATENCY_CLASS_MEDIUM, @@ -101,19 +93,17 @@ enum mlx5_ib_mad_ifc_flags { }; enum { - MLX5_CROSS_CHANNEL_UUAR = 0, + MLX5_CROSS_CHANNEL_UUAR = 0, }; enum { - MLX5_IB_MAX_CTX_DYNAMIC_UARS = 256, - MLX5_IB_INVALID_UAR_INDEX = -1U + MLX5_CQE_VERSION_V0, + MLX5_CQE_VERSION_V1, }; -enum { - MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES = 13, - MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES = 6, - MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES = 16, - MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES = 9, +struct mlx5_ib_vma_private_data { + struct list_head list; + struct vm_area_struct *vma; }; struct mlx5_ib_ucontext { @@ -124,9 +114,10 @@ struct mlx5_ib_ucontext { */ struct mutex db_page_mutex; struct mlx5_uuar_info uuari; - u32 dynamic_wc_uar_index[MLX5_IB_MAX_CTX_DYNAMIC_UARS]; + u8 cqe_version; /* Transport Domain number */ u32 tdn; + struct list_head vma_private_list; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -137,32 +128,83 @@ static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibuconte struct mlx5_ib_pd { struct ib_pd ibpd; u32 pdn; - u32 pa_lkey; }; +#define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) +#if (MLX5_IB_FLOW_LAST_PRIO <= 0) +#error "Invalid number of bypass priorities" +#endif +#define MLX5_IB_FLOW_LEFTOVERS_PRIO (MLX5_IB_FLOW_MCAST_PRIO + 1) + +#define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) +#define MLX5_IB_NUM_SNIFFER_FTS 2 +struct mlx5_ib_flow_prio { + struct mlx5_flow_table *flow_table; + unsigned int refcount; +}; + +struct mlx5_ib_flow_handler { + struct list_head list; + struct ib_flow ibflow; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_rule *rule; +}; + +struct mlx5_ib_flow_db { + struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; + struct mlx5_flow_table *lag_demux_ft; + /* Protect flow steering bypass flow tables + * when add/del flow rules. + * only single add/removal of flow steering rule could be done + * simultaneously. 
+ */ + struct mutex lock; +}; + +/* Use macros here so that don't have to duplicate + * enum ib_send_flags and enum ib_qp_type for low-level driver + */ + +#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) + +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END + +#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 +/* + * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI + * creates the actual hardware QP. + */ +#define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 +#define MLX5_IB_WR_UMR IB_WR_RESERVED1 + +/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. + * + * These flags are intended for internal use by the mlx5_ib driver, and they + * rely on the range reserved for that use in the ib_qp_create_flags enum. + */ + +/* Create a UD QP whose source QP number is 1 */ +static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void) +{ + return IB_QP_CREATE_RESERVED_START; +} + struct wr_list { u16 opcode; u16 next; }; -struct mlx5_swr_ctx { - u64 wrid; - u32 wr_data; - struct wr_list w_list; - u32 wqe_head; - u8 sig_piped; - u8 rsvd[11]; -}; - -struct mlx5_rwr_ctx { - u64 wrid; -}; - struct mlx5_ib_wq { - union { - struct mlx5_swr_ctx *swr_ctx; - struct mlx5_rwr_ctx *rwr_ctx; - }; + u64 *wrid; + u32 *wr_data; + struct wr_list *w_list; + unsigned *wqe_head; u16 unsig_count; /* serialize post to the work queue @@ -180,6 +222,25 @@ struct mlx5_ib_wq { void *qend; }; +struct mlx5_ib_rwq { + struct ib_wq ibwq; + struct mlx5_core_qp core_qp; + u32 rq_num_pas; + u32 log_rq_stride; + u32 log_rq_size; + u32 rq_page_offset; + u32 log_page_size; + struct ib_umem *umem; + size_t buf_size; + unsigned int page_shift; + int create_type; + struct mlx5_db db; + u32 user_index; + u32 wqe_count; + u32 wqe_shift; + int wq_sig; +}; + enum { MLX5_QP_USER, MLX5_QP_KERNEL, @@ -191,60 +252,103 @@ enum { MLX5_WQ_KERNEL }; -struct mlx5_ib_sqd { - struct mlx5_ib_qp *qp; - struct work_struct work; +struct mlx5_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_rwq_ind_tbl; + u32 rqtn; }; -struct mlx5_ib_mc_flows_list { - struct list_head flows_list; - /*Protect the flows_list*/ - struct mutex lock; +/* + * Connect-IB can trigger up to four concurrent pagefaults + * per-QP. 
+ */ +enum mlx5_ib_pagefault_context { + MLX5_IB_PAGEFAULT_RESPONDER_READ, + MLX5_IB_PAGEFAULT_REQUESTOR_READ, + MLX5_IB_PAGEFAULT_RESPONDER_WRITE, + MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, + MLX5_IB_PAGEFAULT_CONTEXTS +}; + +static inline enum mlx5_ib_pagefault_context + mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) +{ + return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); +} + +struct mlx5_ib_pfault { + struct work_struct work; + struct mlx5_pagefault mpfault; +}; + +struct mlx5_ib_ubuffer { + struct ib_umem *umem; + int buf_size; + u64 buf_addr; +}; + +struct mlx5_ib_qp_base { + struct mlx5_ib_qp *container_mibqp; + struct mlx5_core_qp mqp; + struct mlx5_ib_ubuffer ubuffer; +}; + +struct mlx5_ib_qp_trans { + struct mlx5_ib_qp_base base; + u16 xrcdn; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; +}; + +struct mlx5_ib_rss_qp { + u32 tirn; +}; + +struct mlx5_ib_rq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *rq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tirn; + u8 state; +}; + +struct mlx5_ib_sq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *sq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tisn; + u8 state; +}; + +struct mlx5_ib_raw_packet_qp { + struct mlx5_ib_sq sq; + struct mlx5_ib_rq rq; }; struct mlx5_ib_qp { struct ib_qp ibqp; - struct mlx5_core_qp mqp; - struct mlx5_core_qp mrq; - struct mlx5_core_qp msq; - u32 tisn; - u32 tirn; + union { + struct mlx5_ib_qp_trans trans_qp; + struct mlx5_ib_raw_packet_qp raw_packet_qp; + struct mlx5_ib_rss_qp rss_qp; + }; struct mlx5_buf buf; struct mlx5_db db; struct mlx5_ib_wq rq; - u32 doorbell_qpn; u8 sq_signal_bits; u8 fm_cache; - int sq_max_wqes_per_wr; - int sq_spare_wqes; struct mlx5_ib_wq sq; - struct ib_umem *umem; - int buf_size; - /* Raw Ethernet QP's SQ is allocated seperately - * from the RQ's buffer in user-space. - */ - struct ib_umem *sq_umem; - int sq_buf_size; - u64 sq_buf_addr; - int allow_mp_wqe; - /* serialize qp state modifications */ struct mutex mutex; - u16 xrcdn; u32 flags; u8 port; - u8 alt_port; - u8 atomic_rd_en; - u8 resp_depth; u8 state; - /* Raw Ethernet QP's SQ and RQ states */ - u8 rq_state; - u8 sq_state; - int mlx_type; int wq_sig; int scat_cqe; int max_inline_data; @@ -257,16 +361,27 @@ struct mlx5_ib_qp { int uuarn; int create_type; - u32 pa_lkey; /* Store signature errors */ bool signature_en; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * A flag that is true for QP's that are in a state that doesn't + * allow page faults, and shouldn't schedule any more faults. + */ + int disable_page_faults; + /* + * The disable_page_faults_lock protects a QP's disable_page_faults + * field, allowing for a thread to atomically check whether the QP + * allows page faults, and if so schedule a page fault. 
+ */ + spinlock_t disable_page_faults_lock; + struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; +#endif struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; - - struct mlx5_ib_mc_flows_list mc_flows_list; }; struct mlx5_ib_cq_buf { @@ -277,12 +392,20 @@ struct mlx5_ib_cq_buf { }; enum mlx5_ib_qp_flags { - MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, - MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, - MLX5_IB_QP_CAP_RX_END_PADDING = 1 << 5, + MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, + MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, + MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, + MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, + /* QP uses 1 as its source QP number */ + MLX5_IB_QP_SQPN_QP1 = 1 << 6, + MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, + MLX5_IB_QP_RSS = 1 << 8, }; struct mlx5_umr_wr { + struct ib_send_wr wr; union { u64 virt_addr; u64 offset; @@ -290,11 +413,16 @@ struct mlx5_umr_wr { struct ib_pd *pd; unsigned int page_shift; unsigned int npages; - u64 length; + u32 length; int access_flags; u32 mkey; }; +static inline struct mlx5_umr_wr *umr_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct mlx5_umr_wr, wr); +} + struct mlx5_shared_mr_info { int mr_id; struct ib_umem *umem; @@ -316,8 +444,17 @@ struct mlx5_ib_cq { struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; - struct list_head list_send_qp; - struct list_head list_recv_qp; + struct list_head list_send_qp; + struct list_head list_recv_qp; + u32 create_flags; + struct list_head wc_list; + enum ib_cq_notify_flags notify_flags; + struct work_struct notify_work; +}; + +struct mlx5_ib_wc { + struct ib_wc wc; + struct list_head list; }; struct mlx5_ib_srq { @@ -353,44 +490,45 @@ enum mlx5_ib_mtt_access_flags { struct mlx5_ib_mr { struct ib_mr ibmr; - struct mlx5_core_mr mmr; + void *descs; + dma_addr_t desc_map; + int ndescs; + int max_descs; + int desc_size; + int access_mode; + struct mlx5_core_mr mmkey; struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; struct list_head list; int order; int umred; - dma_addr_t dma; int npages; struct mlx5_ib_dev *dev; - struct mlx5_create_mkey_mbox_out out; + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; struct mlx5_core_sig_ctx *sig; - u32 max_reg_descriptors; - u64 size; - u64 page_count; - struct mlx5_ib_mr **children; - int nchild; + int live; + void *descs_alloc; + int access_flags; /* Needed for rereg MR */ }; -struct mlx5_ib_fast_reg_page_list { - struct ib_fast_reg_page_list ibfrpl; - __be64 *mapped_page_list; - dma_addr_t map; +struct mlx5_ib_mw { + struct ib_mw ibmw; + struct mlx5_core_mr mmkey; }; struct mlx5_ib_umr_context { + struct ib_cqe cqe; enum ib_wc_status status; struct completion done; }; -static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) -{ - context->status = -1; - init_completion(&context->done); -} - struct umr_common { struct ib_pd *pd; - struct ib_mr *mr; + struct ib_cq *cq; + struct ib_qp *qp; + /* control access to UMR QP + */ + struct semaphore sem; }; enum { @@ -399,27 +537,6 @@ enum { MLX5_FMR_BUSY, }; -struct mlx5_ib_fmr { - struct ib_fmr ibfmr; - struct mlx5_core_mr mr; - int access_flags; - int state; - /* protect fmr state - */ - spinlock_t lock; - u64 wrid; - struct ib_send_wr wr[2]; - u8 page_shift; - struct ib_fast_reg_page_list page_list; -}; - -struct cache_order { - struct kobject kobj; - int order; - int 
index; - struct mlx5_ib_dev *dev; -}; - struct mlx5_cache_ent { struct list_head head; /* sync access to the cahce entry @@ -427,17 +544,23 @@ struct mlx5_cache_ent { spinlock_t lock; + struct dentry *dir; + char name[4]; u32 order; u32 size; u32 cur; u32 miss; u32 limit; + struct dentry *fsize; + struct dentry *fcur; + struct dentry *fmiss; + struct dentry *flimit; + struct mlx5_ib_dev *dev; struct work_struct work; struct delayed_work dwork; int pending; - struct cache_order co; }; struct mlx5_mr_cache { @@ -445,9 +568,15 @@ struct mlx5_mr_cache { struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; int stopped; struct dentry *root; - int last_add; - int rel_timeout; - int rel_imm; + unsigned long last_add; +}; + +struct mlx5_ib_gsi_qp; + +struct mlx5_ib_port_resources { + struct mlx5_ib_resources *devr; + struct mlx5_ib_gsi_qp *gsi; + struct work_struct pkey_change_work; }; struct mlx5_ib_resources { @@ -457,106 +586,58 @@ struct mlx5_ib_resources { struct ib_pd *p0; struct ib_srq *s0; struct ib_srq *s1; + struct mlx5_ib_port_resources ports[2]; + /* Protects changes to the port resources */ + struct mutex mutex; }; -struct mlx5_dc_tracer { - struct page *pg; - dma_addr_t dma; - int size; - int order; -}; - -struct mlx5_dc_desc { - dma_addr_t dma; - void *buf; -}; - -enum mlx5_op { - MLX5_WR_OP_MLX = 1, -}; - -struct mlx5_mlx_wr { - u8 sl; - u16 dlid; - int icrc; -}; - -struct mlx5_send_wr { - struct ib_send_wr wr; - union { - struct mlx5_mlx_wr mlx; - } sel; -}; - -struct mlx5_dc_data { - struct ib_mr *mr; - struct ib_qp *dcqp; - struct ib_cq *rcq; - struct ib_cq *scq; - unsigned int rx_npages; - unsigned int tx_npages; - struct mlx5_dc_desc *rxdesc; - struct mlx5_dc_desc *txdesc; - unsigned int max_wqes; - unsigned int cur_send; - unsigned int last_send_completed; - int tx_pending; - struct mlx5_ib_dev *dev; - int port; - int initialized; - struct kobject kobj; - unsigned long connects; - unsigned long cnaks; - unsigned long discards; - struct ib_wc wc_tbl[MLX5_CNAK_RX_POLL_CQ_QUOTA]; -}; - -struct mlx5_ib_port_sysfs_group { - struct kobject kobj; - bool enabled; - struct attribute_group counters; -}; - -#define MLX5_IB_GID_MAX 16 - struct mlx5_ib_port { - struct mlx5_ib_dev *dev; - u8 port_num; /* 0 based */ - u8 port_gone; /* set when gone */ u16 q_cnt_id; - struct mlx5_ib_port_sysfs_group group; - union ib_gid gid_table[MLX5_IB_GID_MAX]; +}; + +struct mlx5_roce { + /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL + * netdev pointer + */ + rwlock_t netdev_lock; + struct net_device *netdev; + struct notifier_block nb; + atomic_t next_port; }; struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_roce roce; MLX5_DECLARE_DOORBELL_LOCK(uar_lock); int num_ports; /* serialize update of capability mask */ struct mutex cap_mask_mutex; - bool ib_active; + bool ib_active; struct umr_common umrc; /* sync used page count stats */ struct mlx5_ib_resources devr; - struct mutex slow_path_mutex; - int enable_atomic_resp; - enum ib_atomic_cap atomic_cap; struct mlx5_mr_cache cache; - struct kobject mr_cache; + struct timer_list delay_timer; + /* Prevents soft lock on massive reg MRs */ + struct mutex slow_path_mutex; + int fill_delay; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct ib_odp_caps odp_caps; + /* + * Sleepable RCU that prevents destruction of MRs while they are still + * being used by a page fault handler. 
+ */ + struct srcu_struct mr_srcu; +#endif + struct mlx5_ib_flow_db flow_db; /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; - struct list_head qp_list; - struct timer_list delay_timer; - int fill_delay; - struct mlx5_dc_tracer dctr; - struct mlx5_dc_data dcd[MLX5_MAX_PORTS]; - struct kobject *dc_kobj; + struct list_head qp_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; - struct kobject *ports_parent; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -574,11 +655,6 @@ static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev) return container_of(ibdev, struct mlx5_ib_dev, ib_dev); } -static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr); -} - static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx5_ib_cq, ibcq); @@ -586,22 +662,17 @@ static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) { - return container_of(mqp, struct mlx5_ib_qp, mqp); + return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } -static inline struct mlx5_ib_qp *sq_to_mibqp(struct mlx5_core_qp *msq) +static inline struct mlx5_ib_rwq *to_mibrwq(struct mlx5_core_qp *core_qp) { - return container_of(msq, struct mlx5_ib_qp, msq); + return container_of(core_qp, struct mlx5_ib_rwq, core_qp); } -static inline struct mlx5_ib_qp *rq_to_mibqp(struct mlx5_core_qp *mrq) +static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmkey) { - return container_of(mrq, struct mlx5_ib_qp, mrq); -} - -static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) -{ - return container_of(mmr, struct mlx5_ib_mr, mmr); + return container_of(mmkey, struct mlx5_ib_mr, mmkey); } static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd) @@ -619,6 +690,16 @@ static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp) return container_of(ibqp, struct mlx5_ib_qp, ibqp); } +static inline struct mlx5_ib_rwq *to_mrwq(struct ib_wq *ibwq) +{ + return container_of(ibwq, struct mlx5_ib_rwq, ibwq); +} + +static inline struct mlx5_ib_rwq_ind_table *to_mrwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + return container_of(ib_rwq_ind_tbl, struct mlx5_ib_rwq_ind_table, ib_rwq_ind_tbl); +} + static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq) { return container_of(msrq, struct mlx5_ib_srq, msrq); @@ -629,9 +710,9 @@ static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr) return container_of(ibmr, struct mlx5_ib_mr, ibmr); } -static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl) +static inline struct mlx5_ib_mw *to_mmw(struct ib_mw *ibmw) { - return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl); + return container_of(ibmw, struct mlx5_ib_mw, ibmw); } struct mlx5_ib_ah { @@ -644,17 +725,15 @@ static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) return container_of(ibah, struct mlx5_ib_ah, ibah); } -int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, uintptr_t virt, +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db); void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db); void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, 
int wqe_index); int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, - u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, - void *in_mad, void *response_mad); -struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah, enum rdma_link_layer ll); + u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad); struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int mlx5_ib_destroy_ah(struct ib_ah *ah); @@ -680,62 +759,62 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length, + struct mlx5_ib_qp_base *base); struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - struct ib_cq_init_attr *attr, + const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int mlx5_ib_destroy_cq(struct ib_cq *cq); int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); -int mlx5_ib_modify_cq(struct ib_cq *cq, struct ib_cq_attr *attr, int cq_attr_mask); +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, - struct ib_udata *udata, int mr_id); -struct ib_mr *mlx5_ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int access_flags, - u64 *virt_addr); + struct ib_udata *udata); +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata); +int mlx5_ib_dealloc_mw(struct ib_mw *mw); +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, + int npages, int zap); +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int access_flags, + struct ib_pd *pd, struct ib_udata *udata); int mlx5_ib_dereg_mr(struct ib_mr *ibmr); -int mlx5_ib_destroy_mr(struct ib_mr *ibmr); -struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, - int max_page_list_len); -struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, - int page_list_len); -void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); - -struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc, - struct ib_fmr_attr *fmr_attr); -int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int npages, u64 iova); -int mlx5_ib_unmap_fmr(struct list_head *fmr_list); -int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr); +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, struct ib_mad *out_mad); + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t 
*out_mad_size, + u16 *out_mad_pkey_index); struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata); int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd); int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset); int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port); -int mlx5_query_smp_attr_node_info_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, struct ib_smp *out_mad); -int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid); -int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys); -int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, u32 *vendor_id); -int mlx5_query_pkey_mad_ifc(struct ib_device *ibdev, u8 port, u16 index, +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc); +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid); +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey); -int mlx5_query_node_desc_mad_ifc(struct mlx5_ib_dev *dev, char *node_desc); -int mlx5_query_node_guid_mad_ifc(struct mlx5_ib_dev *dev, u64 *node_guid); -int mlx5_query_gids_mad_ifc(struct ib_device *ibdev, u8 port, int index, +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid); -int mlx5_query_port_mad_ifc(struct ib_device *ibdev, u8 port, +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); @@ -743,29 +822,88 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int *ncont, int *order); +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, + __be64 *pas, int access_flags); void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int page_shift, __be64 *pas, int umr); + int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); -int mlx5_query_port_roce(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props); -__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port, int index, - __be16 ah_udp_s_port); -int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port, - int index, int *gid_type); -struct net_device *mlx5_ib_get_netdev(struct ib_device *ib_dev, u8 port); -int modify_gid_roce(struct ib_device *ib_dev, u8 port, unsigned int index, - const union ib_gid *gid, struct net_device *ndev); -int query_gid_roce(struct ib_device *ib_dev, u8 port, int index, - union ib_gid *gid); -int mlx5_process_mad_mad_ifc(struct ib_device *ibdev, int mad_flags, - u8 port_num, struct ib_wc *in_wc, - struct ib_grh *in_grh, struct ib_mad *in_mad, - struct ib_mad *out_mad); +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 
check_mask, + struct ib_mr_status *mr_status); +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_wq(struct ib_wq *wq); +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +extern struct workqueue_struct *mlx5_ib_page_fault_wq; + +void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); +void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, + struct mlx5_ib_pfault *pfault); +void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); +int __init mlx5_ib_odp_init(void); +void mlx5_ib_odp_cleanup(void); +void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); +void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, + unsigned long end); +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) +{ + return; +} + +static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} +static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } +static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} +static inline int mlx5_ib_odp_init(void) { return 0; } +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} +static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +int mlx5_ib_get_vf_config(struct ib_device *device, int vf, + u8 port, struct ifla_vf_info *info); +int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, + u8 port, int state); +int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, + u8 port, struct ifla_vf_stats *stats); +int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, + u64 guid, int type); + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index); + +/* GSI QP helper functions */ +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr); +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp); +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask); +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); + +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); static inline void init_query_mad(struct ib_smp *mad) { @@ -784,7 +922,72 @@ static inline u8 convert_access(int acc) MLX5_PERM_LOCAL_READ; } +static inline int is_qp1(enum ib_qp_type qp_type) +{ + return qp_type == MLX5_IB_QPT_HW_GSI; +} + #define MLX5_MAX_UMR_SHIFT 16 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) +static inline u32 check_cq_create_flags(u32 flags) +{ + /* + * It returns non-zero value for unsupported CQ + * create 
flags, otherwise it returns zero. + */ + return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | + IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); +} + +static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, + u32 *user_index) +{ + if (cqe_version) { + if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || + (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) + return -EINVAL; + *user_index = cmd_uidx; + } else { + *user_index = MLX5_IB_DEFAULT_UIDX; + } + + return 0; +} + +static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_qp *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_srq *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} #endif /* MLX5_IB_H */ diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c index 73947b6b06ad..fc8d71355644 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_ah.c @@ -27,15 +27,11 @@ #include "mlx5_ib.h" -#define IPV6_DEFAULT_HOPLIMIT 64 - -struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, - struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah, enum rdma_link_layer ll) +static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct mlx5_ib_ah *ah, + struct ib_ah_attr *ah_attr, + enum rdma_link_layer ll) { - int err; - int gid_type; - if (ah_attr->ah_flags & IB_AH_GRH) { memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label | @@ -48,21 +44,12 @@ struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, ah->av.stat_rate_sl = (ah_attr->static_rate << 4); if (ll == IB_LINK_LAYER_ETHERNET) { - err = mlx5_get_roce_gid_type(dev, ah_attr->port_num, - ah_attr->grh.sgid_index, - &gid_type); - if (err) - return ERR_PTR(err); - memcpy(ah->av.rmac, ah_attr->dmac, sizeof(ah_attr->dmac)); - ah->av.udp_sport = mlx5_get_roce_udp_sport( - dev, - ah_attr->port_num, - ah_attr->grh.sgid_index, - 0); + ah->av.udp_sport = + mlx5_get_roce_udp_sport(dev, + ah_attr->port_num, + ah_attr->grh.sgid_index); ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1; - ah->av.hop_limit = ah_attr->grh.hop_limit; - /* TODO: initialize other eth fields */ } else { ah->av.rlid = cpu_to_be16(ah_attr->dlid); ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; @@ -77,22 +64,17 @@ struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) struct mlx5_ib_ah *ah; struct mlx5_ib_dev *dev = to_mdev(pd->device); enum rdma_link_layer ll; - struct ib_ah *ret = ERR_PTR(-EINVAL); + + ll = pd->device->get_link_layer(pd->device, ah_attr->port_num); + + if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH)) + return ERR_PTR(-EINVAL); ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); - ll = pd->device->get_link_layer(pd->device, ah_attr->port_num); - - if (ll == IB_LINK_LAYER_ETHERNET && 
!(ah_attr->ah_flags & IB_AH_GRH)) - goto err_kfree_ah; - - return create_ib_ah(dev, ah_attr, ah, ll); /* never fails */ - -err_kfree_ah: - kfree(ah); - return ret; + return create_ib_ah(dev, ah, ah_attr, ll); /* never fails */ } int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c index 001e54d0be5f..52382b80c4fa 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c @@ -28,8 +28,8 @@ #include #include #include +#include #include "mlx5_ib.h" -#include "user.h" static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq) { @@ -96,15 +96,18 @@ static void *next_cqe_sw(struct mlx5_ib_cq *cq) static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx) { - switch (wq->swr_ctx[idx].wr_data) { + switch (wq->wr_data[idx]) { + case MLX5_IB_WR_UMR: + return 0; + case IB_WR_LOCAL_INV: return IB_WC_LOCAL_INV; - case IB_WR_FAST_REG_MR: - return IB_WC_FAST_REG_MR; + case IB_WR_REG_MR: + return IB_WC_REG_MR; default: - printf("mlx5_ib: WARN: ""unknown completion status\n"); + pr_warn("unknown completion status\n"); return 0; } } @@ -121,7 +124,6 @@ static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, break; case MLX5_OPCODE_SEND_IMM: wc->wc_flags |= IB_WC_WITH_IMM; - case MLX5_OPCODE_NOP: case MLX5_OPCODE_SEND: case MLX5_OPCODE_SEND_INVAL: wc->opcode = IB_WC_SEND; @@ -146,9 +148,6 @@ static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, wc->opcode = IB_WC_MASKED_FETCH_ADD; wc->byte_len = 8; break; - case MLX5_OPCODE_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; case MLX5_OPCODE_UMR: wc->opcode = get_umr_comp(wq, idx); break; @@ -163,14 +162,12 @@ enum { static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, struct mlx5_ib_qp *qp) { + enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1); struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); struct mlx5_ib_srq *srq; struct mlx5_ib_wq *wq; u16 wqe_ctr; u8 g; -#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) - u8 udp_header_valid; -#endif if (qp->ibqp.srq || qp->ibqp.xrcd) { struct mlx5_core_srq *msrq = NULL; @@ -191,7 +188,7 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, } } else { wq = &qp->rq; - wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; } wc->byte_len = be32_to_cpu(cqe->byte_cnt); @@ -204,7 +201,10 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, break; case MLX5_CQE_RESP_SEND: wc->opcode = IB_WC_RECV; - wc->wc_flags = 0; + wc->wc_flags = IB_WC_IP_CSUM_OK; + if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) && + (cqe->hds_ip_ext & CQE_L4_OK)))) + wc->wc_flags = 0; break; case MLX5_CQE_RESP_SEND_IMM: wc->opcode = IB_WC_RECV; @@ -223,14 +223,30 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, wc->dlid_path_bits = cqe->ml_path; g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3; wc->wc_flags |= g ? 
IB_WC_GRH : 0; - wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; + if (unlikely(is_qp1(qp->ibqp.qp_type))) { + u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff; -#if defined(DX_ROCE_V1_5) || defined(DX_WINDOWS) - udp_header_valid = wc->sl & 0x8; - if (udp_header_valid) - wc->wc_flags |= IB_WC_WITH_UDP_HDR; + ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey, + &wc->pkey_index); + } else { + wc->pkey_index = 0; + } -#endif + if (ll != IB_LINK_LAYER_ETHERNET) + return; + + switch (wc->sl & 0x3) { + case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_IB; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; } static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) @@ -240,7 +256,9 @@ static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) mlx5_ib_warn(dev, "dump error cqe\n"); for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4) - printf("mlx5_ib: INFO: ""%08x %08x %08x %08x\n", be32_to_cpu(p[0]), be32_to_cpu(p[1]), be32_to_cpu(p[2]), be32_to_cpu(p[3])); + pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]), + be32_to_cpu(p[1]), be32_to_cpu(p[2]), + be32_to_cpu(p[3])); } static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, @@ -302,14 +320,14 @@ static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev, dump_cqe(dev, cqe); } -static int is_atomic_response(struct mlx5_ib_qp *qp, u16 idx) +static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx) { /* TBD: waiting decision */ return 0; } -static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, u16 idx) +static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx) { struct mlx5_wqe_data_seg *dpseg; void *addr; @@ -317,12 +335,12 @@ static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, u16 idx) dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_raddr_seg) + sizeof(struct mlx5_wqe_atomic_seg); - addr = (void *)(uintptr_t)be64_to_cpu(dpseg->addr); + addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr); return addr; } static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, - u16 idx) + uint16_t idx) { void *addr; int byte_count; @@ -335,10 +353,10 @@ static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, addr = mlx5_get_atomic_laddr(qp, idx); if (byte_count == 4) { - *(u32 *)addr = be32_to_cpu(*((__be32 *)addr)); + *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr)); } else { for (i = 0; i < byte_count; i += 8) { - *(u64 *)addr = be64_to_cpu(*((__be64 *)addr)); + *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr)); addr += 8; } } @@ -357,9 +375,9 @@ static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64, if (idx == head) break; - tail = qp->sq.swr_ctx[idx].w_list.next; + tail = qp->sq.w_list[idx].next; } while (1); - tail = qp->sq.swr_ctx[idx].w_list.next; + tail = qp->sq.w_list[idx].next; qp->sq.last_poll = tail; } @@ -368,12 +386,44 @@ static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf) mlx5_buf_free(dev->mdev, &buf->buf); } +static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe, + struct ib_sig_err *item) +{ + u16 syndrome = be16_to_cpu(cqe->syndrome); + +#define GUARD_ERR (1 << 13) +#define APPTAG_ERR (1 << 12) +#define REFTAG_ERR (1 << 11) + + if (syndrome & GUARD_ERR) { + item->err_type = IB_SIG_BAD_GUARD; + item->expected = 
be32_to_cpu(cqe->expected_trans_sig) >> 16; + item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16; + } else + if (syndrome & REFTAG_ERR) { + item->err_type = IB_SIG_BAD_REFTAG; + item->expected = be32_to_cpu(cqe->expected_reftag); + item->actual = be32_to_cpu(cqe->actual_reftag); + } else + if (syndrome & APPTAG_ERR) { + item->err_type = IB_SIG_BAD_APPTAG; + item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff; + item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff; + } else { + pr_err("Got signature completion error with bad syndrome %04x\n", + syndrome); + } + + item->sig_err_offset = be64_to_cpu(cqe->err_offset); + item->key = be32_to_cpu(cqe->mkey); +} + static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc, int *npolled) { struct mlx5_ib_wq *wq; - unsigned cur; - unsigned idx; + unsigned int cur; + unsigned int idx; int np; int i; @@ -386,14 +436,14 @@ static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries, for (i = 0; i < cur && np < num_entries; i++) { idx = wq->last_poll & (wq->wqe_cnt - 1); - wc->wr_id = wq->swr_ctx[idx].wrid; + wc->wr_id = wq->wrid[idx]; wc->status = IB_WC_WR_FLUSH_ERR; wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; wq->tail++; np++; wc->qp = &qp->ibqp; wc++; - wq->last_poll = wq->swr_ctx[idx].w_list.next; + wq->last_poll = wq->w_list[idx].next; } *npolled = np; } @@ -402,7 +452,7 @@ static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries, struct ib_wc *wc, int *npolled) { struct mlx5_ib_wq *wq; - unsigned cur; + unsigned int cur; int np; int i; @@ -414,7 +464,7 @@ static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries, return; for (i = 0; i < cur && np < num_entries; i++) { - wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; wc->status = IB_WC_WR_FLUSH_ERR; wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; wq->tail++; @@ -445,11 +495,6 @@ static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries, } } -static inline u32 mlx5_ib_base_mkey(const u32 key) -{ - return key & 0xffffff00u; -} - static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_ib_qp **cur_qp, struct ib_wc *wc) @@ -460,11 +505,11 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_core_qp *mqp; struct mlx5_ib_wq *wq; struct mlx5_sig_err_cqe *sig_err_cqe; - struct mlx5_core_mr *mmr; + struct mlx5_core_mr *mmkey; struct mlx5_ib_mr *mr; unsigned long flags; - u8 opcode; - u32 qpn; + uint8_t opcode; + uint32_t qpn; u16 wqe_ctr; void *cqe; int idx; @@ -503,12 +548,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, * from the table. 
*/ mqp = __mlx5_qp_lookup(dev->mdev, qpn); - if (unlikely(!mqp)) { - mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n", - cq->mcq.cqn, qpn); - return -EINVAL; - } - *cur_qp = to_mibqp(mqp); } @@ -520,13 +559,9 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, idx = wqe_ctr & (wq->wqe_cnt - 1); handle_good_req(wc, cqe64, wq, idx); handle_atomics(*cur_qp, cqe64, wq->last_poll, idx); - wc->wr_id = wq->swr_ctx[idx].wrid; - wq->tail = wq->swr_ctx[idx].wqe_head + 1; - if (unlikely(wq->swr_ctx[idx].w_list.opcode & - MLX5_OPCODE_SIGNATURE_CANCELED)) - wc->status = IB_WC_GENERAL_ERR; - else - wc->status = IB_WC_SUCCESS; + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; + wc->status = IB_WC_SUCCESS; break; case MLX5_CQE_RESP_WR_IMM: case MLX5_CQE_RESP_SEND: @@ -550,8 +585,8 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, wq = &(*cur_qp)->sq; wqe_ctr = be16_to_cpu(cqe64->wqe_counter); idx = wqe_ctr & (wq->wqe_cnt - 1); - wc->wr_id = wq->swr_ctx[idx].wrid; - wq->tail = wq->swr_ctx[idx].wqe_head + 1; + wc->wr_id = wq->wrid[idx]; + wq->tail = wq->wqe_head[idx] + 1; } else { struct mlx5_ib_srq *srq; @@ -562,7 +597,7 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, mlx5_ib_free_srq_wqe(srq, wqe_ctr); } else { wq = &(*cur_qp)->rq; - wc->wr_id = wq->rwr_ctx[wq->tail & (wq->wqe_cnt - 1)].wrid; + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; } } @@ -571,20 +606,19 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; spin_lock_irqsave(&dev->mdev->priv.mr_table.lock, flags); - mmr = __mlx5_mr_lookup(dev->mdev, - mlx5_ib_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); - if (unlikely(!mmr)) { - spin_unlock_irqrestore(&dev->mdev->priv.mr_table.lock, flags); - mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n", - cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey)); - return -EINVAL; - } - - mr = to_mibmr(mmr); + mmkey = __mlx5_mr_lookup(dev->mdev, + mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); + mr = to_mibmr(mmkey); + get_sig_err_item(sig_err_cqe, &mr->sig->err_item); mr->sig->sig_err_exists = true; mr->sig->sigerr_count++; - mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR\n", cq->mcq.cqn); + mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", + cq->mcq.cqn, mr->sig->err_item.key, + mr->sig->err_item.err_type, + (long long)mr->sig->err_item.sig_err_offset, + mr->sig->err_item.expected, + mr->sig->err_item.actual); spin_unlock_irqrestore(&dev->mdev->priv.mr_table.lock, flags); goto repoll; @@ -593,6 +627,28 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, return 0; } +static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, + struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); + struct mlx5_ib_wc *soft_wc, *next; + int npolled = 0; + + list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) { + if (npolled >= num_entries) + break; + + mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", + cq->mcq.cqn); + + wc[npolled++] = soft_wc->wc; + list_del(&soft_wc->list); + kfree(soft_wc); + } + + return npolled; +} + int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct mlx5_ib_cq *cq = to_mcq(ibcq); @@ -600,8 +656,8 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); struct mlx5_core_dev *mdev = dev->mdev; unsigned long flags; + int soft_polled = 0; int npolled; - int err = 0; spin_lock_irqsave(&cq->lock, flags); if (mdev->state == 
MLX5_DEVICE_STATE_INTERNAL_ERROR) { @@ -609,9 +665,11 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) goto out; } - for (npolled = 0; npolled < num_entries; npolled++) { - err = mlx5_poll_one(cq, &cur_qp, wc + npolled); - if (err) + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc); + + for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { + if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled)) break; } @@ -620,26 +678,33 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) out: spin_unlock_irqrestore(&cq->lock, flags); - if (err == 0 || err == -EAGAIN) - return npolled; - else - return err; + return soft_polled + npolled; } int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; + struct mlx5_ib_cq *cq = to_mcq(ibcq); void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + unsigned long irq_flags; + int ret = 0; + spin_lock_irqsave(&cq->lock, irq_flags); + if (cq->notify_flags != IB_CQ_NEXT_COMP) + cq->notify_flags = flags & IB_CQ_SOLICITED_MASK; - mlx5_cq_arm(&to_mcq(ibcq)->mcq, + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list)) + ret = 1; + spin_unlock_irqrestore(&cq->lock, irq_flags); + + mlx5_cq_arm(&cq->mcq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, uar_page, MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), to_mcq(ibcq)->mcq.cons_index); - return 0; + return ret; } static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, @@ -648,11 +713,9 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, int err; err = mlx5_buf_alloc(dev->mdev, nent * cqe_size, - PAGE_SIZE * 2, &buf->buf); - if (err) { - mlx5_ib_err(dev, "alloc failed\n"); + 2 * PAGE_SIZE, &buf->buf); + if (err) return err; - } buf->cqe_size = cqe_size; buf->nent = nent; @@ -662,38 +725,32 @@ static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf, static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct ib_ucontext *context, struct mlx5_ib_cq *cq, - int entries, struct mlx5_create_cq_mbox_in **cqb, + int entries, u32 **cqb, int *cqe_size, int *index, int *inlen) { - struct mlx5_exp_ib_create_cq ucmd; + struct mlx5_ib_create_cq ucmd; size_t ucmdlen; int page_shift; + __be64 *pas; int npages; int ncont; + void *cqc; int err; - memset(&ucmd, 0, sizeof(ucmd)); - ucmdlen = (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < - sizeof(struct mlx5_ib_create_cq)) ? - (sizeof(struct mlx5_ib_create_cq) - sizeof(ucmd.reserved)) : - sizeof(struct mlx5_ib_create_cq); + sizeof(ucmd)) ? 
(sizeof(ucmd) - + sizeof(ucmd.reserved)) : sizeof(ucmd); - if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { - mlx5_ib_err(dev, "copy failed\n"); + if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) return -EFAULT; - } - if (ucmdlen == sizeof(ucmd) && ucmd.reserved != 0) { - mlx5_ib_err(dev, "command corrupted\n"); + if (ucmdlen == sizeof(ucmd) && + ucmd.reserved != 0) return -EINVAL; - } - if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) { - mlx5_ib_warn(dev, "wrong CQE size %d\n", ucmd.cqe_size); + if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) return -EINVAL; - } *cqe_size = ucmd.cqe_size; @@ -707,42 +764,31 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr, &cq->db); - if (err) { - mlx5_ib_warn(dev, "map failed\n"); + if (err) goto err_umem; - } mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift, &ncont, NULL); mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n", - (unsigned long long)ucmd.buf_addr, entries * ucmd.cqe_size, - npages, page_shift, ncont); + (long long)ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont); - *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont; + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont; *cqb = mlx5_vzalloc(*inlen); if (!*cqb) { err = -ENOMEM; goto err_db; } - mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); - (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, pas, 0); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); *index = to_mucontext(context)->uuari.uars[0].index; - if (*cqe_size == 64 && MLX5_CAP_GEN(dev->mdev, cqe_compression)) { - if (ucmd.exp_data.cqe_comp_en == 1 && - (ucmd.exp_data.comp_mask & MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_EN)) { - MLX5_SET(cqc, &(*cqb)->ctx, cqe_compression_en, 1); - if (ucmd.exp_data.cqe_comp_recv_type == - MLX5_IB_CQE_FORMAT_CSUM && - (ucmd.exp_data.comp_mask & - MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_RECV_TYPE)) - MLX5_SET(cqc, &(*cqb)->ctx, mini_cqe_res_format, - MLX5_IB_CQE_FORMAT_CSUM); - } - } - return 0; err_db: @@ -755,7 +801,6 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context) { - mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db); ib_umem_release(cq->buf.umem); } @@ -775,9 +820,10 @@ static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf) static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, int entries, int cqe_size, - struct mlx5_create_cq_mbox_in **cqb, - int *index, int *inlen) + u32 **cqb, int *index, int *inlen) { + __be64 *pas; + void *cqc; int err; err = mlx5_db_alloc(dev->mdev, &cq->db); @@ -794,15 +840,21 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, init_cq_buf(cq, &cq->buf); - *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages; + *inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * cq->buf.buf.npages; *cqb = mlx5_vzalloc(*inlen); if (!*cqb) { err = -ENOMEM; goto err_buf; } - mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas); + 
mlx5_fill_page_array(&cq->buf.buf, pas); + + cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context); + MLX5_SET(cqc, cqc, log_page_size, + cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + *index = dev->mdev->priv.uuari.uars[0].index; return 0; @@ -821,32 +873,42 @@ static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq) mlx5_db_free(dev->mdev, &cq->db); } +static void notify_soft_wc_handler(struct work_struct *work) +{ + struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq, + notify_work); + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, - struct ib_cq_init_attr *attr, + const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata) { - struct mlx5_create_cq_mbox_in *cqb = NULL; - struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_cq *cq; int entries = attr->cqe; int vector = attr->comp_vector; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_cq *cq; int uninitialized_var(index); int uninitialized_var(inlen); + u32 *cqb = NULL; + void *cqc; int cqe_size; - int irqn; + unsigned int irqn; int eqn; int err; - if (entries < 0 || roundup_pow_of_two(entries + 1) > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { - mlx5_ib_warn(dev, "wrong entries number %d(%ld), max %d\n", - entries, roundup_pow_of_two(entries + 1), - 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); + if (entries < 0 || + (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) return ERR_PTR(-EINVAL); - } + + if (check_cq_create_flags(attr->flags)) + return ERR_PTR(-EOPNOTSUPP); entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) + return ERR_PTR(-EINVAL); cq = kzalloc(sizeof(*cq), GFP_KERNEL); if (!cq) @@ -857,7 +919,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; - + cq->create_flags = attr->flags; INIT_LIST_HEAD(&cq->list_send_qp); INIT_LIST_HEAD(&cq->list_recv_qp); @@ -867,24 +929,32 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, if (err) goto err_create; } else { - cqe_size = (cache_line_size() >= 128 ? 128 : 64); + cqe_size = cache_line_size() == 128 ? 
128 : 64; err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb, &index, &inlen); if (err) goto err_create; + + INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } - cq->cqe_size = cqe_size; - cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; - cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); if (err) goto err_cqb; - cqb->ctx.c_eqn = cpu_to_be16(eqn); - cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma); + cq->cqe_size = cqe_size; - err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); + cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context); + MLX5_SET(cqc, cqc, cqe_sz, cqe_sz_to_mlx_sz(cqe_size)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + MLX5_SET(cqc, cqc, uar_page, index); + MLX5_SET(cqc, cqc, c_eqn, eqn); + MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); + if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) + MLX5_SET(cqc, cqc, oi, 1); + + err = mlx5_core_create_cq(dev->mdev, &cq->mcq, + (struct mlx5_create_cq_mbox_in *)cqb, inlen); if (err) goto err_cqb; @@ -893,6 +963,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; + INIT_LIST_HEAD(&cq->wc_list); + if (context) if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) { err = -EFAULT; @@ -1006,44 +1078,17 @@ void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq) spin_unlock_irq(&cq->lock); } -int mlx5_ib_modify_cq(struct ib_cq *cq, struct ib_cq_attr *attr, int cq_attr_mask) +int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) { - struct mlx5_modify_cq_mbox_in *in; struct mlx5_ib_dev *dev = to_mdev(cq->device); struct mlx5_ib_cq *mcq = to_mcq(cq); - u16 cq_count = attr->moderation.cq_count; - u16 cq_period = attr->moderation.cq_period; - int err; - u32 fsel = 0; - in = kzalloc(sizeof(*in), GFP_KERNEL); - if (!in) - return -ENOMEM; + if (!MLX5_CAP_GEN(dev->mdev, cq_moderation)) + return -ENOSYS; - in->cqn = cpu_to_be32(mcq->mcq.cqn); - if (cq_attr_mask & IB_CQ_MODERATION) { - if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) { - fsel |= (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT); - if (cq_period & 0xf000) { - /* A value higher than 0xfff is required, better - * use the largest value possible. 
*/ - cq_period = 0xfff; - printf("mlx5_ib: INFO: ""period supported is limited to 12 bits\n"); - } - - in->ctx.cq_period = cpu_to_be16(cq_period); - in->ctx.cq_max_count = cpu_to_be16(cq_count); - } else { - err = -ENOSYS; - goto out; - } - } - in->field_select = cpu_to_be32(fsel); - err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in)); - -out: - kfree(in); + err = mlx5_core_modify_cq_moderation(dev->mdev, &mcq->mcq, + cq_period, cq_count); if (err) mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn); @@ -1163,7 +1208,8 @@ static int copy_resize_cqes(struct mlx5_ib_cq *cq) } if (scqe == start_cqe) { - printf("mlx5_ib: WARN: ""resize CQ failed to get resize CQE, CQN 0x%x\n", cq->mcq.cqn); + pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n", + cq->mcq.cqn); return -ENOMEM; } } @@ -1175,28 +1221,32 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibcq->device); struct mlx5_ib_cq *cq = to_mcq(ibcq); - struct mlx5_modify_cq_mbox_in *in; + void *cqc; + u32 *in; int err; int npas; + __be64 *pas; int page_shift; int inlen; int uninitialized_var(cqe_size); unsigned long flags; if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) { - mlx5_ib_warn(dev, "Firmware does not support resize CQ\n"); + pr_info("Firmware does not support resize CQ\n"); return -ENOSYS; } - if (entries < 1 || roundup_pow_of_two(entries + 1) > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { - mlx5_ib_warn(dev, "wrong entries number %d(%ld), max %d\n", - entries, roundup_pow_of_two(entries + 1), + if (entries < 1 || + entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d, max %d\n", + entries, 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); return -EINVAL; } entries = roundup_pow_of_two(entries + 1); + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) + return -EINVAL; if (entries == ibcq->cqe + 1) return 0; @@ -1214,39 +1264,45 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) } } - if (err) { - mlx5_ib_warn(dev, "resize failed: %d\n", err); + if (err) goto ex; - } - inlen = sizeof(*in) + npas * sizeof(in->pas[0]); + inlen = MLX5_ST_SZ_BYTES(modify_cq_in) + + MLX5_FLD_SZ_BYTES(modify_cq_in, pas[0]) * npas; + in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto ex_resize; } + pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas); if (udata) mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift, - in->pas, 0); + pas, 0); else - mlx5_fill_page_array(&cq->resize_buf->buf, in->pas); + mlx5_fill_page_array(&cq->resize_buf->buf, pas); - in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE | - MLX5_MODIFY_CQ_MASK_PG_OFFSET | - MLX5_MODIFY_CQ_MASK_PG_SIZE); - in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; - in->ctx.page_offset = 0; - in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24); - in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE); - in->cqn = cpu_to_be32(cq->mcq.cqn); + MLX5_SET(modify_cq_in, in, + modify_field_select_resize_field_select.resize_field_select.resize_field_select, + MLX5_MODIFY_CQ_MASK_LOG_SIZE | + MLX5_MODIFY_CQ_MASK_PG_OFFSET | + MLX5_MODIFY_CQ_MASK_PG_SIZE); - err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen); - if (err) { - mlx5_ib_warn(dev, "modify cq failed: %d\n", err); + cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context); + + MLX5_SET(cqc, cqc, log_page_size, + page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(cqc, cqc, cqe_sz, 
cqe_sz_to_mlx_sz(cqe_size)); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries)); + + MLX5_SET(modify_cq_in, in, op_mod, MLX5_CQ_OPMOD_RESIZE); + MLX5_SET(modify_cq_in, in, cqn, cq->mcq.cqn); + + err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, + (struct mlx5_modify_cq_mbox_in *)in, inlen); + if (err) goto ex_alloc; - } if (udata) { cq->ibcq.cqe = entries - 1; @@ -1301,3 +1357,27 @@ int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) cq = to_mcq(ibcq); return cq->cqe_size; } + +/* Called from atomic context */ +int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc) +{ + struct mlx5_ib_wc *soft_wc; + struct mlx5_ib_cq *cq = to_mcq(ibcq); + unsigned long flags; + + soft_wc = kmalloc(sizeof(*soft_wc), GFP_ATOMIC); + if (!soft_wc) + return -ENOMEM; + + soft_wc->wc = *wc; + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&soft_wc->list, &cq->wc_list); + if (cq->notify_flags == IB_CQ_NEXT_COMP || + wc->status != IB_WC_SUCCESS) { + cq->notify_flags = 0; + schedule_work(&cq->notify_work); + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c index ecbf6584c715..490363fe3bc9 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_doorbell.c @@ -34,11 +34,11 @@ struct mlx5_ib_user_db_page { struct list_head list; struct ib_umem *umem; - uintptr_t user_virt; + unsigned long user_virt; int refcnt; }; -int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, uintptr_t virt, +int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt, struct mlx5_db *db) { struct mlx5_ib_user_db_page *page; diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c new file mode 100644 index 000000000000..31d1c11a2d35 --- /dev/null +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_gsi.c @@ -0,0 +1,536 @@ +/*- + * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include "mlx5_ib.h" + +struct mlx5_ib_gsi_wr { + struct ib_cqe cqe; + struct ib_wc wc; + int send_flags; + bool completed:1; +}; + +struct mlx5_ib_gsi_qp { + struct ib_qp ibqp; + struct ib_qp *rx_qp; + u8 port_num; + struct ib_qp_cap cap; + enum ib_sig_type sq_sig_type; + /* Serialize qp state modifications */ + struct mutex mutex; + struct ib_cq *cq; + struct mlx5_ib_gsi_wr *outstanding_wrs; + u32 outstanding_pi, outstanding_ci; + int num_qps; + /* Protects access to the tx_qps. Post send operations synchronize + * with tx_qp creation in setup_qp(). Also protects the + * outstanding_wrs array and indices. + */ + spinlock_t lock; + struct ib_qp **tx_qps; +}; + +static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp) +{ + return container_of(qp, struct mlx5_ib_gsi_qp, ibqp); +} + +static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn); +} + +/* Call with gsi->lock locked */ +static void generate_completions(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_cq *gsi_cq = gsi->ibqp.send_cq; + struct mlx5_ib_gsi_wr *wr; + u32 index; + + for (index = gsi->outstanding_ci; index != gsi->outstanding_pi; + index++) { + wr = &gsi->outstanding_wrs[index % gsi->cap.max_send_wr]; + + if (!wr->completed) + break; + + if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR || + wr->send_flags & IB_SEND_SIGNALED) + WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc)); + + wr->completed = false; + } + + gsi->outstanding_ci = index; +} + +static void handle_single_completion(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_gsi_qp *gsi = cq->cq_context; + struct mlx5_ib_gsi_wr *wr = + container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe); + u64 wr_id; + unsigned long flags; + + spin_lock_irqsave(&gsi->lock, flags); + wr->completed = true; + wr_id = wr->wc.wr_id; + wr->wc = *wc; + wr->wc.wr_id = wr_id; + wr->wc.qp = &gsi->ibqp; + + generate_completions(gsi); + spin_unlock_irqrestore(&gsi->lock, flags); +} + +struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_gsi_qp *gsi; + struct ib_qp_init_attr hw_init_attr = *init_attr; + const u8 port_num = init_attr->port_num; + const int num_pkeys = pd->device->attrs.max_pkeys; + const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? 
num_pkeys : 0; + int ret; + + mlx5_ib_dbg(dev, "creating GSI QP\n"); + + if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { + mlx5_ib_warn(dev, + "invalid port number %d during GSI QP creation\n", + port_num); + return ERR_PTR(-EINVAL); + } + + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); + if (!gsi) + return ERR_PTR(-ENOMEM); + + gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL); + if (!gsi->tx_qps) { + ret = -ENOMEM; + goto err_free; + } + + gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr, + sizeof(*gsi->outstanding_wrs), + GFP_KERNEL); + if (!gsi->outstanding_wrs) { + ret = -ENOMEM; + goto err_free_tx; + } + + mutex_init(&gsi->mutex); + + mutex_lock(&dev->devr.mutex); + + if (dev->devr.ports[port_num - 1].gsi) { + mlx5_ib_warn(dev, "GSI QP already exists on port %d\n", + port_num); + ret = -EBUSY; + goto err_free_wrs; + } + gsi->num_qps = num_qps; + spin_lock_init(&gsi->lock); + + gsi->cap = init_attr->cap; + gsi->sq_sig_type = init_attr->sq_sig_type; + gsi->ibqp.qp_num = 1; + gsi->port_num = port_num; + + gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0, + IB_POLL_SOFTIRQ); + if (IS_ERR(gsi->cq)) { + mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n", + PTR_ERR(gsi->cq)); + ret = PTR_ERR(gsi->cq); + goto err_free_wrs; + } + + hw_init_attr.qp_type = MLX5_IB_QPT_HW_GSI; + hw_init_attr.send_cq = gsi->cq; + if (num_qps) { + hw_init_attr.cap.max_send_wr = 0; + hw_init_attr.cap.max_send_sge = 0; + hw_init_attr.cap.max_inline_data = 0; + } + gsi->rx_qp = ib_create_qp(pd, &hw_init_attr); + if (IS_ERR(gsi->rx_qp)) { + mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n", + PTR_ERR(gsi->rx_qp)); + ret = PTR_ERR(gsi->rx_qp); + goto err_destroy_cq; + } + + dev->devr.ports[init_attr->port_num - 1].gsi = gsi; + + mutex_unlock(&dev->devr.mutex); + + return &gsi->ibqp; + +err_destroy_cq: + ib_free_cq(gsi->cq); +err_free_wrs: + mutex_unlock(&dev->devr.mutex); + kfree(gsi->outstanding_wrs); +err_free_tx: + kfree(gsi->tx_qps); +err_free: + kfree(gsi); + return ERR_PTR(ret); +} + +int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + const int port_num = gsi->port_num; + int qp_index; + int ret; + + mlx5_ib_dbg(dev, "destroying GSI QP\n"); + + mutex_lock(&dev->devr.mutex); + ret = ib_destroy_qp(gsi->rx_qp); + if (ret) { + mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. 
error %d\n", + ret); + mutex_unlock(&dev->devr.mutex); + return ret; + } + dev->devr.ports[port_num - 1].gsi = NULL; + mutex_unlock(&dev->devr.mutex); + gsi->rx_qp = NULL; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) { + if (!gsi->tx_qps[qp_index]) + continue; + WARN_ON_ONCE(ib_destroy_qp(gsi->tx_qps[qp_index])); + gsi->tx_qps[qp_index] = NULL; + } + + ib_free_cq(gsi->cq); + + kfree(gsi->outstanding_wrs); + kfree(gsi->tx_qps); + kfree(gsi); + + return 0; +} + +static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) +{ + struct ib_pd *pd = gsi->rx_qp->pd; + struct ib_qp_init_attr init_attr = { + .event_handler = gsi->rx_qp->event_handler, + .qp_context = gsi->rx_qp->qp_context, + .send_cq = gsi->cq, + .recv_cq = gsi->rx_qp->recv_cq, + .cap = { + .max_send_wr = gsi->cap.max_send_wr, + .max_send_sge = gsi->cap.max_send_sge, + .max_inline_data = gsi->cap.max_inline_data, + }, + .sq_sig_type = gsi->sq_sig_type, + .qp_type = IB_QPT_UD, + .create_flags = mlx5_ib_create_qp_sqpn_qp1(), + }; + + return ib_create_qp(pd, &init_attr); +} + +static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, + u16 qp_index) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct ib_qp_attr attr; + int mask; + int ret; + + mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = qp_index; + attr.qkey = IB_QP1_QKEY; + attr.port_num = gsi->port_num; + ret = ib_modify_qp(qp, &attr, mask); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to INIT: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTR: %d\n", + qp->qp_num, ret); + return ret; + } + + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + mlx5_ib_err(dev, "could not change QP%d state to RTS: %d\n", + qp->qp_num, ret); + return ret; + } + + return 0; +} + +static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) +{ + struct ib_device *device = gsi->rx_qp->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_qp *qp; + unsigned long flags; + u16 pkey; + int ret; + + ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (ret) { + mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", + gsi->port_num, qp_index); + return; + } + + if (!pkey) { + mlx5_ib_dbg(dev, "invalid P_Key at port %d, index %d. Skipping.\n", + gsi->port_num, qp_index); + return; + } + + spin_lock_irqsave(&gsi->lock, flags); + qp = gsi->tx_qps[qp_index]; + spin_unlock_irqrestore(&gsi->lock, flags); + if (qp) { + mlx5_ib_dbg(dev, "already existing GSI TX QP at port %d, index %d. 
Skipping\n", + gsi->port_num, qp_index); + return; + } + + qp = create_gsi_ud_qp(gsi); + if (IS_ERR(qp)) { + mlx5_ib_warn(dev, "unable to create hardware UD QP for GSI: %ld\n", + PTR_ERR(qp)); + return; + } + + ret = modify_to_rts(gsi, qp, qp_index); + if (ret) + goto err_destroy_qp; + + spin_lock_irqsave(&gsi->lock, flags); + WARN_ON_ONCE(gsi->tx_qps[qp_index]); + gsi->tx_qps[qp_index] = qp; + spin_unlock_irqrestore(&gsi->lock, flags); + + return; + +err_destroy_qp: + WARN_ON_ONCE(qp); +} + +static void setup_qps(struct mlx5_ib_gsi_qp *gsi) +{ + u16 qp_index; + + for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index) + setup_qp(gsi, qp_index); +} + +int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state); + + mutex_lock(&gsi->mutex); + ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask); + if (ret) { + mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret); + goto unlock; + } + + if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS) + setup_qps(gsi); + +unlock: + mutex_unlock(&gsi->mutex); + + return ret; +} + +int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + int ret; + + mutex_lock(&gsi->mutex); + ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr); + qp_init_attr->cap = gsi->cap; + mutex_unlock(&gsi->mutex); + + return ret; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr, struct ib_wc *wc) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_gsi_wr *gsi_wr; + + if (gsi->outstanding_pi == gsi->outstanding_ci + gsi->cap.max_send_wr) { + mlx5_ib_warn(dev, "no available GSI work request.\n"); + return -ENOMEM; + } + + gsi_wr = &gsi->outstanding_wrs[gsi->outstanding_pi % + gsi->cap.max_send_wr]; + gsi->outstanding_pi++; + + if (!wc) { + memset(&gsi_wr->wc, 0, sizeof(gsi_wr->wc)); + gsi_wr->wc.pkey_index = wr->pkey_index; + gsi_wr->wc.wr_id = wr->wr.wr_id; + } else { + gsi_wr->wc = *wc; + gsi_wr->completed = true; + } + + gsi_wr->cqe.done = &handle_single_completion; + wr->wr.wr_cqe = &gsi_wr->cqe; + + return 0; +} + +/* Call with gsi->lock locked */ +static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, + struct ib_ud_wr *wr) +{ + struct ib_wc wc = { + { .wr_id = wr->wr.wr_id }, + .status = IB_WC_SUCCESS, + .opcode = IB_WC_SEND, + .qp = &gsi->ibqp, + }; + int ret; + + ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc); + if (ret) + return ret; + + generate_completions(gsi); + + return 0; +} + +/* Call with gsi->lock locked */ +static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) +{ + struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + int qp_index = wr->pkey_index; + + if (!mlx5_ib_deth_sqpn_cap(dev)) + return gsi->rx_qp; + + if (qp_index >= gsi->num_qps) + return NULL; + + return gsi->tx_qps[qp_index]; +} + +int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + struct ib_qp *tx_qp; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ib_ud_wr cur_wr = *ud_wr(wr); + + cur_wr.wr.next = NULL; + + spin_lock_irqsave(&gsi->lock, flags); + tx_qp = get_tx_qp(gsi, &cur_wr); + if (!tx_qp) { + ret = 
mlx5_ib_gsi_silent_drop(gsi, &cur_wr); + if (ret) + goto err; + spin_unlock_irqrestore(&gsi->lock, flags); + continue; + } + + ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL); + if (ret) + goto err; + + ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr); + if (ret) { + /* Undo the effect of adding the outstanding wr */ + gsi->outstanding_pi = (gsi->outstanding_pi - 1) % + gsi->cap.max_send_wr; + goto err; + } + spin_unlock_irqrestore(&gsi->lock, flags); + } + + return 0; + +err: + spin_unlock_irqrestore(&gsi->lock, flags); + *bad_wr = wr; + return ret; +} + +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); + + return ib_post_recv(gsi->rx_qp, wr, bad_wr); +} + +void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi) +{ + if (!gsi) + return; + + mutex_lock(&gsi->mutex); + setup_qps(gsi); + mutex_unlock(&gsi->mutex); +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c index bf0ca9a0f2e6..719cddec9c34 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mad.c @@ -25,11 +25,11 @@ * $FreeBSD$ */ +#include #include #include #include #include "mlx5_ib.h" -#include enum { MLX5_IB_VENDOR_CLASS1 = 0x9, @@ -37,8 +37,8 @@ enum { }; int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, - u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh, - void *in_mad, void *response_mad) + u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const void *in_mad, void *response_mad) { u8 op_modifier = 0; @@ -54,8 +54,8 @@ int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, } static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, struct ib_mad *out_mad) + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid; int err; @@ -106,89 +106,148 @@ static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, - struct mlx5_vport_counters *vc) + void *out) { - pma_cnt_ext->port_xmit_data = cpu_to_be64((vc->transmitted_ib_unicast.octets + - vc->transmitted_ib_multicast.octets) >> 2); - pma_cnt_ext->port_rcv_data = cpu_to_be64((vc->received_ib_unicast.octets + - vc->received_ib_multicast.octets) >> 2); - pma_cnt_ext->port_xmit_packets = cpu_to_be64(vc->transmitted_ib_unicast.packets + - vc->transmitted_ib_multicast.packets); - pma_cnt_ext->port_rcv_packets = cpu_to_be64(vc->received_ib_unicast.packets + - vc->received_ib_multicast.packets); - pma_cnt_ext->port_unicast_xmit_packets = cpu_to_be64(vc->transmitted_ib_unicast.packets); - pma_cnt_ext->port_unicast_rcv_packets = cpu_to_be64(vc->received_ib_unicast.packets); - pma_cnt_ext->port_multicast_xmit_packets = cpu_to_be64(vc->transmitted_ib_multicast.packets); - pma_cnt_ext->port_multicast_rcv_packets = cpu_to_be64(vc->received_ib_multicast.packets); +#define MLX5_SUM_CNT(p, cntr1, cntr2) \ + (MLX5_GET64(query_vport_counter_out, p, cntr1) + \ + MLX5_GET64(query_vport_counter_out, p, cntr2)) + + pma_cnt_ext->port_xmit_data = + cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, + transmitted_ib_multicast.octets) >> 2); + pma_cnt_ext->port_rcv_data = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, + received_ib_multicast.octets) >> 2); + pma_cnt_ext->port_xmit_packets = + 
cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.packets, + transmitted_ib_multicast.packets)); + pma_cnt_ext->port_rcv_packets = + cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.packets, + received_ib_multicast.packets)); + pma_cnt_ext->port_unicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_unicast.packets); + pma_cnt_ext->port_unicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_unicast.packets); + pma_cnt_ext->port_multicast_xmit_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, transmitted_ib_multicast.packets); + pma_cnt_ext->port_multicast_rcv_packets = + MLX5_GET64_BE(query_vport_counter_out, + out, received_ib_multicast.packets); } static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, - struct mlx5_vport_counters *vc) + void *out) { - ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data, - (vc->transmitted_ib_unicast.octets + - vc->transmitted_ib_multicast.octets) >> 2); - ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data, - (vc->received_ib_unicast.octets + - vc->received_ib_multicast.octets) >> 2); - ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets, - vc->transmitted_ib_unicast.packets + - vc->transmitted_ib_multicast.packets); - ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets, - vc->received_ib_unicast.packets + - vc->received_ib_multicast.packets); + /* Traffic counters will be reported in + * their 64bit form via ib_pma_portcounters_ext by default. + */ + void *out_pma = MLX5_ADDR_OF(ppcnt_reg, out, + counter_set); + +#define MLX5_ASSIGN_PMA_CNTR(counter_var, counter_name) { \ + counter_var = MLX5_GET_BE(typeof(counter_var), \ + ib_port_cntrs_grp_data_layout, \ + out_pma, counter_name); \ + } + + MLX5_ASSIGN_PMA_CNTR(pma_cnt->symbol_error_counter, + symbol_error_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_error_recovery_counter, + link_error_recovery_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_downed_counter, + link_downed_counter); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_errors, + port_rcv_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_remphys_errors, + port_rcv_remote_physical_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_switch_relay_errors, + port_rcv_switch_relay_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_discards, + port_xmit_discards); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_xmit_constraint_errors, + port_xmit_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->port_rcv_constraint_errors, + port_rcv_constraint_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->link_overrun_errors, + link_overrun_errors); + MLX5_ASSIGN_PMA_CNTR(pma_cnt->vl15_dropped, + vl_15_dropped); } static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, - struct ib_mad *in_mad, struct ib_mad *out_mad) + const struct ib_mad *in_mad, struct ib_mad *out_mad) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_vport_counters *vc; int err; - int ext; + void *out_cnt; - vc = kzalloc(sizeof(*vc), GFP_KERNEL); - if (!vc) - return -ENOMEM; + /* Declaring support of extended counters */ + if (in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO) { + struct ib_class_port_info cpi = {}; - ext = in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT; - - err = mlx5_get_vport_counters(dev->mdev, port_num, vc); - if (!err) { - if (ext) { - struct ib_pma_portcounters_ext *pma_cnt_ext = - (struct ib_pma_portcounters_ext *)(out_mad->data + 40); - - pma_cnt_ext_assign(pma_cnt_ext, vc); - } else { - struct ib_pma_portcounters *pma_cnt = - (struct ib_pma_portcounters *)(out_mad->data + 40); - -
ASSIGN_16BIT_COUNTER(pma_cnt->port_rcv_errors, - (u16)vc->received_errors.packets); - - pma_cnt_assign(pma_cnt, vc); - } - err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + cpi.capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + memcpy((out_mad->data + 40), &cpi, sizeof(cpi)); + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } - kfree(vc); - return err; + if (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT) { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(query_vport_counter_out); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_vport_counter(dev->mdev, 0, 0, + port_num, out_cnt, sz); + if (!err) + pma_cnt_ext_assign(pma_cnt_ext, out_cnt); + } else { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *)(out_mad->data + 40); + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + out_cnt = mlx5_vzalloc(sz); + if (!out_cnt) + return IB_MAD_RESULT_FAILURE; + + err = mlx5_core_query_ib_ppcnt(dev->mdev, port_num, + out_cnt, sz); + if (!err) + pma_cnt_assign(pma_cnt, out_cnt); + } + + kvfree(out_cnt); + if (err) + return IB_MAD_RESULT_FAILURE; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, struct ib_mad *out_mad) + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; memset(out_mad->data, 0, sizeof(out_mad->data)); if (MLX5_CAP_GEN(mdev, vport_counters) && in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { - /* TBD: read error counters from the PPCNT */ return process_pma_cmd(ibdev, port_num, in_mad, out_mad); } else { return process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, @@ -225,7 +284,7 @@ int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) return err; } -int mlx5_query_smp_attr_node_info_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev, struct ib_smp *out_mad) { struct ib_smp *in_mad = NULL; @@ -245,7 +304,7 @@ int mlx5_query_smp_attr_node_info_mad_ifc(struct ib_device *ibdev, return err; } -int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { struct ib_smp *out_mad = NULL; @@ -255,7 +314,7 @@ int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, if (!out_mad) return -ENOMEM; - err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); if (err) goto out; @@ -267,7 +326,7 @@ int mlx5_query_system_image_guid_mad_ifc(struct ib_device *ibdev, return err; } -int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_max_pkeys(struct ib_device *ibdev, u16 *max_pkeys) { struct ib_smp *out_mad = NULL; @@ -277,7 +336,7 @@ int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, if (!out_mad) return -ENOMEM; - err = 
mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); if (err) goto out; @@ -289,7 +348,7 @@ int mlx5_query_max_pkeys_mad_ifc(struct ib_device *ibdev, return err; } -int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, +int mlx5_query_mad_ifc_vendor_id(struct ib_device *ibdev, u32 *vendor_id) { struct ib_smp *out_mad = NULL; @@ -299,7 +358,7 @@ int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, if (!out_mad) return -ENOMEM; - err = mlx5_query_smp_attr_node_info_mad_ifc(ibdev, out_mad); + err = mlx5_query_mad_ifc_smp_attr_node_info(ibdev, out_mad); if (err) goto out; @@ -311,7 +370,7 @@ int mlx5_query_vendor_id_mad_ifc(struct ib_device *ibdev, return err; } -int mlx5_query_node_desc_mad_ifc(struct mlx5_ib_dev *dev, char *node_desc) +int mlx5_query_mad_ifc_node_desc(struct mlx5_ib_dev *dev, char *node_desc) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; @@ -329,14 +388,14 @@ int mlx5_query_node_desc_mad_ifc(struct mlx5_ib_dev *dev, char *node_desc) if (err) goto out; - memcpy(node_desc, out_mad->data, 64); + memcpy(node_desc, out_mad->data, IB_DEVICE_NODE_DESC_MAX); out: kfree(in_mad); kfree(out_mad); return err; } -int mlx5_query_node_guid_mad_ifc(struct mlx5_ib_dev *dev, u64 *node_guid) +int mlx5_query_mad_ifc_node_guid(struct mlx5_ib_dev *dev, __be64 *node_guid) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; @@ -354,14 +413,14 @@ int mlx5_query_node_guid_mad_ifc(struct mlx5_ib_dev *dev, u64 *node_guid) if (err) goto out; - memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); + memcpy(node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } -int mlx5_query_pkey_mad_ifc(struct ib_device *ibdev, u8 port, u16 index, +int mlx5_query_mad_ifc_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct ib_smp *in_mad = NULL; @@ -390,7 +449,7 @@ int mlx5_query_pkey_mad_ifc(struct ib_device *ibdev, u8 port, u16 index, return err; } -int mlx5_query_gids_mad_ifc(struct ib_device *ibdev, u8 port, int index, +int mlx5_query_mad_ifc_gids(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct ib_smp *in_mad = NULL; @@ -430,7 +489,7 @@ int mlx5_query_gids_mad_ifc(struct ib_device *ibdev, u8 port, int index, return err; } -int mlx5_query_port_mad_ifc(struct ib_device *ibdev, u8 port, +int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c index ac1bb7a916a5..0192279aac37 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c @@ -25,35 +25,34 @@ * $FreeBSD$ */ +#include #include #include #include #include -#include +#if defined(CONFIG_X86) +#include +#endif #include -#include -#include -#include -#include -#include -#include +#include #include #undef inode - #include +#include +#include +#include +#include #include #include -#include "user.h" +#include +#include +#include #include "mlx5_ib.h" -#include -#include - #define DRIVER_NAME "mlx5_ib" -#define DRIVER_VERSION "3.2-rc1" -#define DRIVER_RELDATE "May 2016" +#define DRIVER_VERSION "3.4.1-BETA" +#define DRIVER_RELDATE "October 2017" -MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1); @@ -65,61 +64,18 @@ static int deprecated_prof_sel = 2; module_param_named(prof_sel, 
deprecated_prof_sel, int, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); -enum { - MLX5_STANDARD_ATOMIC_SIZE = 0x8, -}; - -struct workqueue_struct *mlx5_ib_wq; - static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; -static void get_atomic_caps(struct mlx5_ib_dev *dev, - struct ib_device_attr *props) -{ - int tmp; - u8 atomic_operations; - u8 atomic_size_qp; - u8 atomic_req_endianess; - - atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); - atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); - atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev, - atomic_req_8B_endianess_mode) || - !mlx5_host_is_le(); - - tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; - if (((atomic_operations & tmp) == tmp) - && (atomic_size_qp & 8)) { - if (atomic_req_endianess) { - props->atomic_cap = IB_ATOMIC_HCA; - } else { - props->atomic_cap = IB_ATOMIC_NONE; - } - } else { - props->atomic_cap = IB_ATOMIC_NONE; - } - - tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD; - if (((atomic_operations & tmp) == tmp) - &&(atomic_size_qp & 8)) { - if (atomic_req_endianess) - props->masked_atomic_cap = IB_ATOMIC_HCA; - else { - props->masked_atomic_cap = IB_ATOMIC_NONE; - } - } else { - props->masked_atomic_cap = IB_ATOMIC_NONE; - } -} +enum { + MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, +}; static enum rdma_link_layer -mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { - struct mlx5_ib_dev *dev = to_mdev(device); - - switch (MLX5_CAP_GEN(dev->mdev, port_type)) { + switch (port_type_cap) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: @@ -129,9 +85,237 @@ mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) } } +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + + return mlx5_port_type_cap_to_rdma_ll(port_type_cap); +} + +static bool mlx5_netdev_match(struct net_device *ndev, + struct mlx5_core_dev *mdev, + const char *dname) +{ + return ndev->if_type == IFT_ETHER && + ndev->if_dname != NULL && + strcmp(ndev->if_dname, dname) == 0 && + ndev->if_softc != NULL && + *(struct mlx5_core_dev **)ndev->if_softc == mdev; +} + +static int mlx5_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev, + roce.nb); + + switch (event) { + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + write_lock(&ibdev->roce.netdev_lock); + /* check if network interface belongs to mlx5en */ + if (mlx5_netdev_match(ndev, ibdev->mdev, "mce")) + ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? + NULL : ndev; + write_unlock(&ibdev->roce.netdev_lock); + break; + + case NETDEV_UP: + case NETDEV_DOWN: { + struct net_device *upper = NULL; + + if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev)) + && ibdev->ib_active) { + struct ib_event ibev = {0}; + + ibev.device = &ibdev->ib_dev; + ibev.event = (event == NETDEV_UP) ? 
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + ibev.element.port_num = 1; + ib_dispatch_event(&ibev); + } + break; + } + + default: + break; + } + + return NOTIFY_DONE; +} + +static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct mlx5_ib_dev *ibdev = to_mdev(device); + struct net_device *ndev; + + /* Ensure ndev does not disappear before we invoke dev_hold() + */ + read_lock(&ibdev->roce.netdev_lock); + ndev = ibdev->roce.netdev; + if (ndev) + dev_hold(ndev); + read_unlock(&ibdev->roce.netdev_lock); + + return ndev; +} + +static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct net_device *ndev; + enum ib_mtu ndev_ib_mtu; + u16 qkey_viol_cntr; + + memset(props, 0, sizeof(*props)); + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + + ndev = mlx5_ib_get_netdev(device, port_num); + if (!ndev) + return 0; + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + ndev_ib_mtu = iboe_get_mtu(ndev->if_mtu); + + dev_put(ndev); + + props->active_mtu = min(props->max_mtu, ndev_ib_mtu); + + props->active_width = IB_WIDTH_4X; /* TODO */ + props->active_speed = IB_SPEED_QDR; /* TODO */ + + return 0; +} + +static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void *mlx5_addr) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_l3_address); + void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_mac_47_32); + + if (!gid) + return; + ether_addr_copy(mlx5_addr_mac, IF_LLADDR(attr->ndev)); + + if (is_vlan_dev(attr->ndev)) { + MLX5_SET_RA(mlx5_addr, vlan_valid, 1); + MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev)); + } + + switch (attr->gid_type) { + case IB_GID_TYPE_IB: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); + break; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); + break; + + default: + WARN_ON(true); + } + + if (attr->gid_type != IB_GID_TYPE_IB) { + if (ipv6_addr_v4mapped((void *)gid)) + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV4); + else + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV6); + } + + if ((attr->gid_type == IB_GID_TYPE_IB) || + !ipv6_addr_v4mapped((void *)gid)) + memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); + else + memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); +} + +static int set_roce_addr(struct ib_device *device, u8 port_num, + unsigned int index, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0}; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); + + if (ll != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); + 
+ MLX5_SET(set_roce_address_in, in, roce_address_index, index); + MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, gid, attr); +} + +static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, + unsigned int index, __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, NULL, NULL); +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index) +{ + struct ib_gid_attr attr; + union ib_gid gid; + + if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) + return 0; + + if (!attr.ndev) + return 0; + + dev_put(attr.ndev); + + if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return 0; + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { - return !dev->mdev->issi; + if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) + return !MLX5_CAP_GEN(dev->mdev, ib_virt); + return 0; } enum { @@ -152,6 +336,28 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev) return MLX5_VPORT_ACCESS_METHOD_HCA; } +static void get_atomic_caps(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 tmp; + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + u8 atomic_req_8B_endianness_mode = + MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode); + + /* Check if HW supports 8 bytes standard atomic operations and capable + * of host endianness respond + */ + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) && + (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && + (atomic_req_8B_endianness_mode)) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } +} + static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { @@ -162,24 +368,26 @@ static int mlx5_query_system_image_guid(struct ib_device *ibdev, switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: - return mlx5_query_system_image_guid_mad_ifc(ibdev, + return mlx5_query_mad_ifc_system_image_guid(ibdev, sys_image_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); - if (!err) - *sys_image_guid = cpu_to_be64(tmp); - return err; + break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); - if (!err) - *sys_image_guid = cpu_to_be64(tmp); - return err; + break; default: return -EINVAL; } + + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + + return err; + } static int mlx5_query_max_pkeys(struct ib_device *ibdev, @@ -190,7 +398,7 @@ static int mlx5_query_max_pkeys(struct ib_device *ibdev, switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: - return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys); + return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: @@ -210,7 +418,7 @@ static int mlx5_query_vendor_id(struct ib_device *ibdev, switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: - return 
mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id); + return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: @@ -229,27 +437,28 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, switch (mlx5_get_vport_access_method(&dev->ib_dev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: - return mlx5_query_node_guid_mad_ifc(dev, node_guid); + return mlx5_query_mad_ifc_node_guid(dev, node_guid); case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); - if (!err) - *node_guid = cpu_to_be64(tmp); - return err; + break; case MLX5_VPORT_ACCESS_METHOD_NIC: err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); - if (!err) - *node_guid = cpu_to_be64(tmp); - return err; + break; default: return -EINVAL; } + + if (!err) + *node_guid = cpu_to_be64(tmp); + + return err; } struct mlx5_reg_node_desc { - u8 desc[64]; + u8 desc[IB_DEVICE_NODE_DESC_MAX]; }; static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) @@ -257,7 +466,7 @@ static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) struct mlx5_reg_node_desc in; if (mlx5_use_mad_ifc(dev)) - return mlx5_query_node_desc_mad_ifc(dev, node_desc); + return mlx5_query_mad_ifc_node_desc(dev, node_desc); memset(&in, 0, sizeof(in)); @@ -267,18 +476,29 @@ static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc) } static int mlx5_ib_query_device(struct ib_device *ibdev, - struct ib_device_attr *props) + struct ib_device_attr *props, + struct ib_udata *uhw) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; - int max_sq_desc; + int err = -ENOMEM; int max_rq_sg; int max_sq_sg; - int err; + u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); + struct mlx5_ib_query_device_resp resp = {}; + size_t resp_len; + u64 max_tso; + resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); + if (uhw->outlen && uhw->outlen < resp_len) + return -EINVAL; + else + resp.response_length = resp_len; + + if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) + return -EINVAL; memset(props, 0, sizeof(*props)); - err = mlx5_query_system_image_guid(ibdev, &props->sys_image_guid); if (err) @@ -293,7 +513,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, return err; props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) | - ((u64)fw_rev_min(dev->mdev) << 16) | + (fw_rev_min(dev->mdev) << 16) | fw_rev_sub(dev->mdev); props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | @@ -306,27 +526,89 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (MLX5_CAP_GEN(mdev, apm)) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; - props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; + if (MLX5_CAP_GEN(mdev, imaicl)) { + props->device_cap_flags |= IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_WINDOW_TYPE_2B; + props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); + /* We support 'Gappy' memory registration too */ + props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; + } props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (MLX5_CAP_GEN(mdev, sho)) { + props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER; + /* At this stage no support for signature handover */ + props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | + IB_PROT_T10DIF_TYPE_2 | + IB_PROT_T10DIF_TYPE_3; + props->sig_guard_cap = IB_GUARD_T10DIF_CRC | + IB_GUARD_T10DIF_CSUM; + } if 
(MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { + if (MLX5_CAP_ETH(mdev, csum_cap)) + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { + max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); + if (max_tso) { + resp.tso_caps.max_tso = 1 << max_tso; + resp.tso_caps.supported_qpts |= + 1 << IB_QPT_RAW_PACKET; + resp.response_length += sizeof(resp.tso_caps); + } + } + + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { + resp.rss_caps.rx_hash_function = + MLX5_RX_HASH_FUNC_TOEPLITZ; + resp.rss_caps.rx_hash_fields_mask = + MLX5_RX_HASH_SRC_IPV4 | + MLX5_RX_HASH_DST_IPV4 | + MLX5_RX_HASH_SRC_IPV6 | + MLX5_RX_HASH_DST_IPV6 | + MLX5_RX_HASH_SRC_PORT_TCP | + MLX5_RX_HASH_DST_PORT_TCP | + MLX5_RX_HASH_SRC_PORT_UDP | + MLX5_RX_HASH_DST_PORT_UDP; + resp.response_length += sizeof(resp.rss_caps); + } + } else { + if (field_avail(typeof(resp), tso_caps, uhw->outlen)) + resp.response_length += sizeof(resp.tso_caps); + if (field_avail(typeof(resp), rss_caps, uhw->outlen)) + resp.response_length += sizeof(resp.rss_caps); + } + + if (MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) { + props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; + props->device_cap_flags |= IB_DEVICE_UD_TSO; + } + + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, scatter_fcs)) + props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; props->max_mr_size = ~0ull; - props->page_size_cap = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) -1); + props->page_size_cap = ~(min_page_size - 1); props->max_qp = 1 << MLX5_CAP_GEN(mdev, log_max_qp); props->max_qp_wr = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); max_rq_sg = MLX5_CAP_GEN(mdev, max_wqe_sz_rq) / sizeof(struct mlx5_wqe_data_seg); - max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512); - max_sq_sg = (max_sq_desc - - sizeof(struct mlx5_wqe_ctrl_seg) - - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); + max_sq_sg = (MLX5_CAP_GEN(mdev, max_wqe_sz_sq) - + sizeof(struct mlx5_wqe_ctrl_seg)) / + sizeof(struct mlx5_wqe_data_seg); props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_sge_rd = MLX5_MAX_SGE_RD; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); @@ -338,14 +620,47 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->local_ca_ack_delay = MLX5_CAP_GEN(mdev, local_ca_ack_delay); props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; - props->max_fast_reg_page_list_len = (unsigned int)-1; + props->max_fast_reg_page_list_len = + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); get_atomic_caps(dev, props); + props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ - props->max_ah = INT_MAX; + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 
+ if (MLX5_CAP_GEN(mdev, pg)) + props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->odp_caps = dev->odp_caps; +#endif + + if (MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + + if (!mlx5_core_is_pf(mdev)) + props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + + if (mlx5_ib_port_link_layer(ibdev, 1) == + IB_LINK_LAYER_ETHERNET) { + props->rss_caps.max_rwq_indirection_tables = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); + props->rss_caps.max_rwq_indirection_table_size = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size); + props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET; + props->max_wq_type_rq = + 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); + } + + if (uhw->outlen) { + err = ib_copy_to_udata(uhw, &resp, resp.response_length); + + if (err) + return err; + } return 0; } @@ -367,8 +682,8 @@ static int translate_active_width(struct ib_device *ibdev, u8 active_width, if (active_width & MLX5_IB_WIDTH_1X) { *ib_width = IB_WIDTH_1X; } else if (active_width & MLX5_IB_WIDTH_2X) { - mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n", - (int)active_width); + mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n", + (int)active_width); err = -EINVAL; } else if (active_width & MLX5_IB_WIDTH_4X) { *ib_width = IB_WIDTH_4X; @@ -385,9 +700,6 @@ static int translate_active_width(struct ib_device *ibdev, u8 active_width, return err; } -/* - * TODO: Move to IB core - */ enum ib_max_vl_num { __IB_MAX_VL_0 = 1, __IB_MAX_VL_0_1 = 2, @@ -435,20 +747,20 @@ static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap, return 0; } -static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props) +static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_core_dev *mdev = dev->mdev; u32 *rep; - int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); + int replen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out); struct mlx5_ptys_reg *ptys; struct mlx5_pmtu_reg *pmtu; struct mlx5_pvlc_reg pvlc; void *ctx; int err; - rep = mlx5_vzalloc(outlen); + rep = mlx5_vzalloc(replen); ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL); if (!rep || !ptys || !pmtu) { @@ -458,8 +770,7 @@ static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port, memset(props, 0, sizeof(*props)); - /* what if I am pf with dual port */ - err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen); + err = mlx5_query_hca_vport_context(mdev, port, 0, rep, replen); if (err) goto out; @@ -477,13 +788,14 @@ static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port, props->max_msg_sz = 1 << MLX5_CAP_GEN(mdev, log_max_msg); props->pkey_tbl_len = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size)); props->bad_pkey_cntr = MLX5_GET(hca_vport_context, ctx, - pkey_violation_counter); + pkey_violation_counter); props->qkey_viol_cntr = MLX5_GET(hca_vport_context, ctx, - qkey_violation_counter); + qkey_violation_counter); props->subnet_timeout = MLX5_GET(hca_vport_context, ctx, - subnet_timeout); + subnet_timeout); props->init_type_reply = MLX5_GET(hca_vport_context, ctx, - init_type_reply); + init_type_reply); + props->grh_required = MLX5_GET(hca_vport_context, ctx, grh_required); ptys->proto_mask |= MLX5_PTYS_IB; ptys->local_port = port; @@ -526,10 +838,10 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, { switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: - return 
mlx5_query_port_mad_ifc(ibdev, port, props); + return mlx5_query_mad_ifc_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_HCA: - return mlx5_query_port_ib(ibdev, port, props); + return mlx5_query_hca_port(ibdev, port, props); case MLX5_VPORT_ACCESS_METHOD_NIC: return mlx5_query_port_roce(ibdev, port, props); @@ -539,155 +851,6 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, } } -static inline int -mlx5_addrconf_ifid_eui48(u8 *eui, struct net_device *dev) -{ - if (dev->if_addrlen != ETH_ALEN) - return -1; - memcpy(eui, IF_LLADDR(dev), 3); - memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); - - /* NOTE: The scope ID is added by the GID to IP conversion */ - - eui[3] = 0xFF; - eui[4] = 0xFE; - eui[0] ^= 2; - return 0; -} - -static void -mlx5_make_default_gid(struct net_device *dev, union ib_gid *gid) -{ - gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - mlx5_addrconf_ifid_eui48(&gid->raw[8], dev); -} - -static void -mlx5_ib_roce_port_update(void *arg) -{ - struct mlx5_ib_port *port = (struct mlx5_ib_port *)arg; - struct mlx5_ib_dev *dev = port->dev; - struct mlx5_core_dev *mdev = dev->mdev; - struct net_device *xdev[MLX5_IB_GID_MAX]; - struct net_device *idev; - struct net_device *ndev; - struct ifaddr *ifa; - union ib_gid gid_temp; - - while (port->port_gone == 0) { - int update = 0; - int gid_index = 0; - int j; - int error; - - ndev = mlx5_get_protocol_dev(mdev, MLX5_INTERFACE_PROTOCOL_ETH); - if (ndev == NULL) { - pause("W", hz); - continue; - } - - CURVNET_SET_QUIET(ndev->if_vnet); - - memset(&gid_temp, 0, sizeof(gid_temp)); - mlx5_make_default_gid(ndev, &gid_temp); - if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) { - port->gid_table[gid_index] = gid_temp; - update = 1; - } - xdev[gid_index] = ndev; - gid_index++; - - IFNET_RLOCK(); - TAILQ_FOREACH(idev, &V_ifnet, if_link) { - if (idev == ndev) - break; - } - if (idev != NULL) { - TAILQ_FOREACH(idev, &V_ifnet, if_link) { - if (idev != ndev) { - if (idev->if_type != IFT_L2VLAN) - continue; - if (ndev != rdma_vlan_dev_real_dev(idev)) - continue; - } - /* clone address information for IPv4 and IPv6 */ - IF_ADDR_RLOCK(idev); - TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) { - if (ifa->ifa_addr == NULL || - (ifa->ifa_addr->sa_family != AF_INET && - ifa->ifa_addr->sa_family != AF_INET6) || - gid_index >= MLX5_IB_GID_MAX) - continue; - memset(&gid_temp, 0, sizeof(gid_temp)); - rdma_ip2gid(ifa->ifa_addr, &gid_temp); - /* check for existing entry */ - for (j = 0; j != gid_index; j++) { - if (bcmp(&gid_temp, &port->gid_table[j], sizeof(gid_temp)) == 0) - break; - } - /* check if new entry must be added */ - if (j == gid_index) { - if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) { - port->gid_table[gid_index] = gid_temp; - update = 1; - } - xdev[gid_index] = idev; - gid_index++; - } - } - IF_ADDR_RUNLOCK(idev); - } - } - IFNET_RUNLOCK(); - CURVNET_RESTORE(); - - if (update != 0 && - mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { - struct ib_event event = { - .device = &dev->ib_dev, - .element.port_num = port->port_num + 1, - .event = IB_EVENT_GID_CHANGE, - }; - - /* add new entries, if any */ - for (j = 0; j != gid_index; j++) { - error = modify_gid_roce(&dev->ib_dev, port->port_num, j, - port->gid_table + j, xdev[j]); - if (error != 0) - printf("mlx5_ib: Failed to update ROCE GID table: %d\n", error); - } - memset(&gid_temp, 0, sizeof(gid_temp)); - - /* clear old entries, if any */ - for (; j != MLX5_IB_GID_MAX; j++) { - if (bcmp(&gid_temp, port->gid_table 
+ j, sizeof(gid_temp)) == 0) - continue; - port->gid_table[j] = gid_temp; - (void) modify_gid_roce(&dev->ib_dev, port->port_num, j, - port->gid_table + j, ndev); - } - - /* make sure ibcore gets updated */ - ib_dispatch_event(&event); - } - pause("W", hz); - } - do { - struct ib_event event = { - .device = &dev->ib_dev, - .element.port_num = port->port_num + 1, - .event = IB_EVENT_GID_CHANGE, - }; - /* make sure ibcore gets updated */ - ib_dispatch_event(&event); - - /* wait a bit */ - pause("W", hz); - } while (0); - port->port_gone = 2; - kthread_exit(); -} - static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { @@ -696,23 +859,15 @@ static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: - return mlx5_query_gids_mad_ifc(ibdev, port, index, gid); + return mlx5_query_mad_ifc_gids(ibdev, port, index, gid); case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid); - case MLX5_VPORT_ACCESS_METHOD_NIC: - if (port == 0 || port > MLX5_CAP_GEN(mdev, num_ports) || - index < 0 || index >= MLX5_IB_GID_MAX || - dev->port[port - 1].port_gone != 0) - memset(gid, 0, sizeof(*gid)); - else - *gid = dev->port[port - 1].gid_table[index]; - return 0; - default: return -EINVAL; } + } static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, @@ -723,13 +878,12 @@ static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: - return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey); + return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: - return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, + return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, pkey); - default: return -EINVAL; } @@ -753,13 +907,13 @@ static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ - memcpy(&in, props->node_desc, 64); + memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX); err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out, sizeof(out), MLX5_REG_NODE_DESC, 0, 1); if (err) return err; - memcpy(ibdev->node_desc, props->node_desc, 64); + memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX); return err; } @@ -767,20 +921,11 @@ static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { - u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) == - IB_LINK_LAYER_ETHERNET); struct mlx5_ib_dev *dev = to_mdev(ibdev); struct ib_port_attr attr; u32 tmp; int err; - /* return OK if this is RoCE. CM calls ib_modify_port() regardless - * of whether port link layer is ETH or IB. For ETH ports, qkey - * violations and port capabilities are not valid. - */ - if (is_eth) - return 0; - mutex_lock(&dev->cap_mask_mutex); err = mlx5_ib_query_port(ibdev, port, &attr); @@ -797,22 +942,12 @@ static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, return err; } -enum mlx5_cap_flags { - MLX5_CAP_COMPACT_AV = 1 << 0, -}; - -static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev) -{ - *flags |= MLX5_CAP_GEN(dev, compact_address_vector) ? 
- MLX5_CAP_COMPACT_AV : 0; -} - static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_alloc_ucontext_req_v2 req; - struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_alloc_ucontext_req_v2 req = {}; + struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; @@ -823,65 +958,65 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, int err; int i; size_t reqlen; + size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, + max_cqe_version); if (!dev->ib_active) return ERR_PTR(-EAGAIN); - memset(&req, 0, sizeof(req)); - memset(&resp, 0, sizeof(resp)); + if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr)) + return ERR_PTR(-EINVAL); reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; - else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + else if (reqlen >= min_req_v2) ver = 2; - else { - mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", (long)reqlen); + else return ERR_PTR(-EINVAL); - } - err = ib_copy_from_udata(&req, udata, reqlen); - if (err) { - mlx5_ib_err(dev, "copy failed\n"); + err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req))); + if (err) return ERR_PTR(err); - } - if (req.reserved) { - mlx5_ib_err(dev, "request corrupted\n"); + if (req.flags) return ERR_PTR(-EINVAL); - } - if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) { - mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars); + if (req.total_num_uuars > MLX5_MAX_UUARS) return ERR_PTR(-ENOMEM); - } + + if (req.total_num_uuars == 0) + return ERR_PTR(-EINVAL); + + if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (reqlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + reqlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_NON_FP_BF_REGS_PER_PAGE); - if (req.num_low_latency_uuars > req.total_num_uuars - 1) { - mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n", - req.total_num_uuars, req.total_num_uuars); + if (req.num_low_latency_uuars > req.total_num_uuars - 1) return ERR_PTR(-EINVAL); - } num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); - resp.cache_line_size = L1_CACHE_BYTES; + resp.cache_line_size = cache_line_size(); resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); - set_mlx5_flags(&resp.flags, dev->mdev); - - if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen) - resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc); - - if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen) - resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + resp.cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); + 
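
For context, the request side of this exchange is now versioned purely by length: an exact legacy-sized copy is treated as v0, anything at least as long as the prefix ending at max_cqe_version is treated as v2, and unknown flag/reserved bits or non-zero trailing bytes are rejected. A minimal sketch of that length check, assuming the uapi types this driver uses; the struct layout and function name below are illustrative only, not taken from the patch:

struct example_req_v2 {
	__u32 total_num_uuars;
	__u32 num_low_latency_uuars;
	__u32 flags;
	__u32 comp_mask;
	__u8  max_cqe_version;	/* everything past this point is optional */
	__u8  reserved0;
	__u16 reserved1;
	__u32 reserved2;
};

static int example_req_version(size_t reqlen)
{
	size_t min_req_v2 = offsetof(struct example_req_v2, max_cqe_version);

	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
		return 0;		/* exact legacy size: v0 */
	if (reqlen >= min_req_v2)
		return 2;		/* v2 or newer; any extra tail must be zeroed */
	return -EINVAL;
}
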
resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) @@ -919,41 +1054,73 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, for (i = 0; i < num_uars; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); - if (err) { - mlx5_ib_err(dev, "uar alloc failed at %d\n", i); - goto out_uars; - } + if (err) + goto out_count; } - for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) - context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; +#endif + + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { + err = mlx5_alloc_transport_domain(dev->mdev, + &context->tdn); + if (err) + goto out_uars; + } + + INIT_LIST_HEAD(&context->vma_private_list); INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); - err = ib_copy_to_udata(udata, &resp, - min_t(size_t, udata->outlen, sizeof(resp))); + + if (field_avail(typeof(resp), cqe_version, udata->outlen)) + resp.response_length += sizeof(resp.cqe_version); + + if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { + resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE; + resp.response_length += sizeof(resp.cmds_supp_uhw); + } + + /* + * We don't want to expose information from the PCI bar that is located + * after 4096 bytes, so if the arch only supports larger pages, let's + * pretend we don't support reading the HCA's core clock. This is also + * forced by mmap function. + */ + if (PAGE_SIZE <= 4096 && + field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % + PAGE_SIZE; + resp.response_length += sizeof(resp.hca_core_clock_offset) + + sizeof(resp.reserved2); + } + + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) - goto out_uars; + goto out_td; uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; - - if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == - IB_LINK_LAYER_ETHERNET) { - err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn); - if (err) - goto out_uars; - } + context->cqe_version = resp.cqe_version; return &context->ibucontext; +out_td: + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_dealloc_transport_domain(dev->mdev, context->tdn); + out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); +out_count: kfree(uuari->count); out_bitmap: @@ -974,18 +1141,13 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_uuar_info *uuari = &context->uuari; int i; - if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == - IB_LINK_LAYER_ETHERNET) + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_dealloc_transport_domain(dev->mdev, context->tdn); for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); } - for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) { - if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX) - mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]); - } kfree(uuari->count); 
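
The response side grows the same way without breaking older libmlx5 binaries: optional tail fields are written and counted into response_length only when the caller's output buffer (udata->outlen) can hold them, which is what the field_avail() checks above implement. A minimal sketch of that pattern; only field_avail() is the driver's macro, the struct and function here are hypothetical:

struct example_resp {
	__u32 qp_tab_size;
	__u32 response_length;	/* bytes the kernel actually filled in */
	__u8  cqe_version;	/* optional tail field */
	__u8  reserved[3];
};

static size_t example_fill_resp(struct example_resp *resp, size_t outlen, __u8 cqe_ver)
{
	size_t len = min(offsetof(struct example_resp, response_length) +
			 sizeof(resp->response_length), outlen);

	/* Each optional field is added only if the user buffer can hold it. */
	if (field_avail(struct example_resp, cqe_version, outlen)) {
		resp->cqe_version = cqe_ver;
		len += sizeof(resp->cqe_version);
	}

	resp->response_length = len;
	return len;	/* then copied out via ib_copy_to_udata(udata, resp, len) */
}
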
kfree(uuari->bitmap); @@ -1015,67 +1177,235 @@ static int get_index(unsigned long offset) return get_arg(offset); } -static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc, - struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev, +static void mlx5_ib_vma_open(struct vm_area_struct *area) +{ + /* vma_open is called when a new VMA is created on top of our VMA. This + * is done through either mremap flow or split_vma (usually due to + * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, + * as this VMA is strongly hardware related. Therefore we set the + * vm_ops of the newly created/cloned VMA to NULL, to prevent it from + * calling us again and trying to do incorrect actions. We assume that + * the original VMA size is exactly a single page, and therefore all + * "splitting" operation will not happen to it. + */ + area->vm_ops = NULL; +} + +static void mlx5_ib_vma_close(struct vm_area_struct *area) +{ + struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; + + /* It's guaranteed that all VMAs opened on a FD are closed before the + * file itself is closed, therefore no sync is needed with the regular + * closing flow. (e.g. mlx5 ib_dealloc_ucontext) + * However need a sync with accessing the vma as part of + * mlx5_ib_disassociate_ucontext. + * The close operation is usually called under mm->mmap_sem except when + * process is exiting. + * The exiting case is handled explicitly as part of + * mlx5_ib_disassociate_ucontext. + */ + mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data; + + /* setting the vma context pointer to null in the mlx5_ib driver's + * private data, to protect a race condition in + * mlx5_ib_disassociate_ucontext(). + */ + mlx5_ib_vma_priv_data->vma = NULL; + list_del(&mlx5_ib_vma_priv_data->list); + kfree(mlx5_ib_vma_priv_data); +} + +static const struct vm_operations_struct mlx5_ib_vm_ops = { + .open = mlx5_ib_vma_open, + .close = mlx5_ib_vma_close +}; + +static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, + struct mlx5_ib_ucontext *ctx) +{ + struct mlx5_ib_vma_private_data *vma_prv; + struct list_head *vma_head = &ctx->vma_private_list; + + vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL); + if (!vma_prv) + return -ENOMEM; + + vma_prv->vma = vma; + vma->vm_private_data = vma_prv; + vma->vm_ops = &mlx5_ib_vm_ops; + + list_add(&vma_prv->list, vma_head); + + return 0; +} + +static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + int ret; + struct vm_area_struct *vma; + struct mlx5_ib_vma_private_data *vma_private, *n; + struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process /* || + owning_process->state == TASK_DEAD */) { + pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the + * task struct. + */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } + + /* need to protect from a race on closing the vma as part of + * mlx5_ib_vma_close. 
+ */ + down_read(&owning_mm->mmap_sem); + list_for_each_entry_safe(vma_private, n, &context->vma_private_list, + list) { + vma = vma_private->vma; + ret = zap_vma_ptes(vma, vma->vm_start, + PAGE_SIZE); + WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + /* context going to be destroyed, should + * not access ops any more. + */ + vma->vm_ops = NULL; + list_del(&vma_private->list); + kfree(vma_private); + } + up_read(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +} + +static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) +{ + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + return "WC"; + case MLX5_IB_MMAP_REGULAR_PAGE: + return "best effort WC"; + case MLX5_IB_MMAP_NC_PAGE: + return "NC"; + default: + return NULL; + } +} + +static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, + struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { + struct mlx5_uuar_info *uuari = &context->uuari; + int err; unsigned long idx; - phys_addr_t pfn; + phys_addr_t pfn, pa; + pgprot_t prot; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) { - mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n", - (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start)); + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: +/* Some architectures don't support WC memory */ +#if defined(CONFIG_X86) + if (!pat_enabled()) + return -EPERM; +#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) + return -EPERM; +#endif + /* fall through */ + case MLX5_IB_MMAP_REGULAR_PAGE: + /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ + prot = pgprot_writecombine(vma->vm_page_prot); + break; + case MLX5_IB_MMAP_NC_PAGE: + prot = pgprot_noncached(vma->vm_page_prot); + break; + default: return -EINVAL; } + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + idx = get_index(vma->vm_pgoff); - if (idx >= uuari->num_uars) { - mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n", - idx, uuari->num_uars); + if (idx >= uuari->num_uars) return -EINVAL; - } pfn = uar_index2pfn(dev, uuari->uars[idx].index); - mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, - (unsigned long long)pfn); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); vma->vm_page_prot = prot; - if (io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot)) { - mlx5_ib_err(dev, "io remap failed\n"); + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", + err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); return -EAGAIN; } - mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? 
"WC" : "NC", - (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); + pa = pfn << PAGE_SHIFT; + mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), + vma->vm_start, &pa); - return 0; + return mlx5_ib_set_vma_data(vma, context); } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); - struct mlx5_uuar_info *uuari = &context->uuari; unsigned long command; + phys_addr_t pfn; command = get_command(vma->vm_pgoff); switch (command) { - case MLX5_IB_MMAP_REGULAR_PAGE: - return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), - true, - uuari, dev, context); - - break; - case MLX5_IB_MMAP_WC_PAGE: - return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot), - true, uuari, dev, context); - break; - case MLX5_IB_MMAP_NC_PAGE: - return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot), - false, uuari, dev, context); + case MLX5_IB_MMAP_REGULAR_PAGE: + return uar_mmap(dev, command, vma, context); + + case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: + return -ENOSYS; + + case MLX5_IB_MMAP_CORE_CLOCK: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + /* Don't expose to user-space information it shouldn't have */ + if (PAGE_SIZE > 4096) + return -EOPNOTSUPP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + pfn = (dev->mdev->iseg_base + + offsetof(struct mlx5_init_seg, internal_timer_h)) >> + PAGE_SHIFT; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); break; default: @@ -1085,58 +1415,10 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm return 0; } -static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) -{ - struct mlx5_create_mkey_mbox_in *in; - struct mlx5_mkey_seg *seg; - struct mlx5_core_mr mr; - int err; - - in = kzalloc(sizeof(*in), GFP_KERNEL); - if (!in) - return -ENOMEM; - - seg = &in->seg; - seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA; - seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64); - seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - seg->start_addr = 0; - - err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in), - NULL, NULL, NULL); - if (err) { - mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); - goto err_in; - } - - kfree(in); - *key = mr.key; - - return 0; - -err_in: - kfree(in); - - return err; -} - -static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key) -{ - struct mlx5_core_mr mr; - int err; - - memset(&mr, 0, sizeof(mr)); - mr.key = key; - err = mlx5_core_destroy_mkey(dev->mdev, &mr); - if (err) - mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key); -} - static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { - struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_pd_resp resp; struct mlx5_ib_pd *pd; int err; @@ -1147,7 +1429,6 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); if (err) { - mlx5_ib_warn(dev, "pd alloc failed\n"); kfree(pd); return ERR_PTR(err); } @@ -1155,19 +1436,10 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, if (context) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { - 
mlx5_ib_err(dev, "copy failed\n"); mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } - } else { - err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn); - if (err) { - mlx5_ib_err(dev, "alloc mkey failed\n"); - mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); - kfree(pd); - return ERR_PTR(err); - } } return &pd->ibpd; @@ -1178,24 +1450,690 @@ static int mlx5_ib_dealloc_pd(struct ib_pd *pd) struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); - if (!pd->uobject) - free_pa_mkey(mdev, mpd->pa_lkey); - mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); kfree(mpd); return 0; } +enum { + MATCH_CRITERIA_ENABLE_OUTER_BIT, + MATCH_CRITERIA_ENABLE_MISC_BIT, + MATCH_CRITERIA_ENABLE_INNER_BIT +}; + +#define HEADER_IS_ZERO(match_criteria, headers) \ + !(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \ + 0, MLX5_FLD_SZ_BYTES(fte_match_param, headers))) \ + +static u8 get_match_criteria_enable(u32 *match_criteria) +{ + u8 match_criteria_enable; + + match_criteria_enable = + (!HEADER_IS_ZERO(match_criteria, outer_headers)) << + MATCH_CRITERIA_ENABLE_OUTER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters)) << + MATCH_CRITERIA_ENABLE_MISC_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, inner_headers)) << + MATCH_CRITERIA_ENABLE_INNER_BIT; + + return match_criteria_enable; +} + +static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val); +} + +static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) +{ + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val); + MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2); + MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); +} + +#define LAST_ETH_FIELD vlan_tag +#define LAST_IB_FIELD sl +#define LAST_IPV4_FIELD tos +#define LAST_IPV6_FIELD traffic_class +#define LAST_TCP_UDP_FIELD src_port + +/* Field is the last supported field */ +#define FIELDS_NOT_SUPPORTED(filter, field)\ + memchr_inv((void *)&filter.field +\ + sizeof(filter.field), 0,\ + sizeof(filter) -\ + offsetof(typeof(filter), field) -\ + sizeof(filter.field)) + +static int parse_flow_attr(u32 *match_c, u32 *match_v, + const union ib_flow_spec *ib_spec) +{ + void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters); + + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) + return -ENOTSUPP; + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dmac_47_16), + ib_spec->eth.mask.dst_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dmac_47_16), + ib_spec->eth.val.dst_mac); + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + smac_47_16), + ib_spec->eth.mask.src_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + smac_47_16), + ib_spec->eth.val.src_mac); + + if (ib_spec->eth.mask.vlan_tag) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + cvlan_tag, 
1); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_vid, ntohs(ib_spec->eth.val.vlan_tag)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_cfi, + ntohs(ib_spec->eth.mask.vlan_tag) >> 12); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_cfi, + ntohs(ib_spec->eth.val.vlan_tag) >> 12); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_prio, + ntohs(ib_spec->eth.mask.vlan_tag) >> 13); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_prio, + ntohs(ib_spec->eth.val.vlan_tag) >> 13); + } + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, ntohs(ib_spec->eth.mask.ether_type)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ntohs(ib_spec->eth.val.ether_type)); + break; + case IB_FLOW_SPEC_IPV4: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) + return -ENOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ETH_P_IP); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.src_ip, + sizeof(ib_spec->ipv4.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.src_ip, + sizeof(ib_spec->ipv4.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.dst_ip, + sizeof(ib_spec->ipv4.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.dst_ip, + sizeof(ib_spec->ipv4.val.dst_ip)); + + set_tos(outer_headers_c, outer_headers_v, + ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos); + + set_proto(outer_headers_c, outer_headers_v, + ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto); + break; + case IB_FLOW_SPEC_IPV6: + if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) + return -ENOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, IPPROTO_IPV6); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.src_ip, + sizeof(ib_spec->ipv6.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + src_ipv4_src_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.src_ip, + sizeof(ib_spec->ipv6.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.mask.dst_ip, + sizeof(ib_spec->ipv6.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dst_ipv4_dst_ipv6.ipv6_layout.ipv6), + &ib_spec->ipv6.val.dst_ip, + sizeof(ib_spec->ipv6.val.dst_ip)); + + set_tos(outer_headers_c, outer_headers_v, + ib_spec->ipv6.mask.traffic_class, + ib_spec->ipv6.val.traffic_class); + + set_proto(outer_headers_c, outer_headers_v, + ib_spec->ipv6.mask.next_hdr, + ib_spec->ipv6.val.next_hdr); + + MLX5_SET(fte_match_set_misc, misc_params_c, + outer_ipv6_flow_label, + ntohl(ib_spec->ipv6.mask.flow_label)); + MLX5_SET(fte_match_set_misc, misc_params_v, + outer_ipv6_flow_label, + ntohl(ib_spec->ipv6.val.flow_label)); + break; + case IB_FLOW_SPEC_TCP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -ENOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, 
outer_headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_TCP); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_UDP: + if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, + LAST_TCP_UDP_FIELD)) + return -ENOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_UDP); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + default: + return -EINVAL; + } + + return 0; +} + +/* If a flow could catch both multicast and unicast packets, + * it won't fall into the multicast flow steering table and this rule + * could steal other multicast packets. + */ +static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) +{ + struct ib_flow_spec_eth *eth_spec; + + if (ib_attr->type != IB_FLOW_ATTR_NORMAL || + ib_attr->size < sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_eth) || + ib_attr->num_of_specs < 1) + return false; + + eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1); + if (eth_spec->type != IB_FLOW_SPEC_ETH || + eth_spec->size != sizeof(*eth_spec)) + return false; + + return is_multicast_ether_addr(eth_spec->mask.dst_mac) && + is_multicast_ether_addr(eth_spec->val.dst_mac); +} + +static bool is_valid_attr(const struct ib_flow_attr *flow_attr) +{ + union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); + bool has_ipv4_spec = false; + bool eth_type_ipv4 = true; + unsigned int spec_index; + + /* Validate that ethertype is correct */ + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + if (ib_spec->type == IB_FLOW_SPEC_ETH && + ib_spec->eth.mask.ether_type) { + if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) && + ib_spec->eth.val.ether_type == htons(ETH_P_IP))) + eth_type_ipv4 = false; + } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) { + has_ipv4_spec = true; + } + ib_spec = (void *)ib_spec + ib_spec->size; + } + return !has_ipv4_spec || eth_type_ipv4; +} + +static void put_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *prio, bool ft_added) +{ + prio->refcount -= !!ft_added; + if (!prio->refcount) { + mlx5_destroy_flow_table(prio->flow_table); + prio->flow_table = NULL; + } +} + +static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) +{ + struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); + struct mlx5_ib_flow_handler *handler = container_of(flow_id, + struct mlx5_ib_flow_handler, + ibflow); + struct mlx5_ib_flow_handler *iter, *tmp; + + mutex_lock(&dev->flow_db.lock); + + list_for_each_entry_safe(iter, tmp, &handler->list, list) { + mlx5_del_flow_rule(iter->rule); + put_flow_table(dev, iter->prio, true); + list_del(&iter->list); + kfree(iter); + } + + mlx5_del_flow_rule(handler->rule); + 
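
For a sense of what parse_flow_attr() above consumes, a userspace caller stacks flow specs directly after the ibv_flow_attr header and hands the whole block to ibv_create_flow(). A libibverbs sketch with arbitrary example values (the MAC address, TCP port and helper name are made up):

#include <infiniband/verbs.h>
#include <arpa/inet.h>

/* Steer TCP/IPv4 frames for one destination MAC and TCP port to 'qp'. */
static struct ibv_flow *steer_tcp_example(struct ibv_qp *qp)
{
	struct {
		struct ibv_flow_attr		attr;
		struct ibv_flow_spec_eth	eth;
		struct ibv_flow_spec_tcp_udp	tcp;
	} __attribute__((packed)) flow = {
		.attr = {
			.type         = IBV_FLOW_ATTR_NORMAL,
			.size         = sizeof(flow),
			.num_of_specs = 2,
			.port         = 1,
		},
		.eth = {
			.type = IBV_FLOW_SPEC_ETH,
			.size = sizeof(struct ibv_flow_spec_eth),
			.val.dst_mac  = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55},
			.mask.dst_mac = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
		},
		.tcp = {
			.type = IBV_FLOW_SPEC_TCP,
			.size = sizeof(struct ibv_flow_spec_tcp_udp),
			.val.dst_port  = htons(80),
			.mask.dst_port = 0xffff,
		},
	};

	return ibv_create_flow(qp, &flow.attr);	/* NULL with errno set on failure */
}

Each spec's type/size pair is what the kernel walks over, and mask bits beyond the last field supported here are rejected by the FIELDS_NOT_SUPPORTED() checks.
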
put_flow_table(dev, handler->prio, true); + mutex_unlock(&dev->flow_db.lock); + + kfree(handler); + + return 0; +} + +static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap) +{ + priority *= 2; + if (!dont_trap) + priority++; + return priority; +} + +enum flow_table_type { + MLX5_IB_FT_RX, + MLX5_IB_FT_TX +}; + +#define MLX5_FS_MAX_TYPES 10 +#define MLX5_FS_MAX_ENTRIES 32000UL +static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, + struct ib_flow_attr *flow_attr, + enum flow_table_type ft_type) +{ + bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP; + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_table *ft; + int num_entries; + int num_groups; + int priority; + int err = 0; + + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (flow_is_multicast_only(flow_attr) && + !dont_trap) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = ib_prio_to_core_prio(flow_attr->priority, + dont_trap); + ns = mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_BYPASS); + num_entries = MLX5_FS_MAX_ENTRIES; + num_groups = MLX5_FS_MAX_TYPES; + prio = &dev->flow_db.prios[priority]; + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + ns = mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_LEFTOVERS); + build_leftovers_ft_param("bypass", &priority, + &num_entries, + &num_groups); + prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; + } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + if (!MLX5_CAP_FLOWTABLE(dev->mdev, + allow_sniffer_and_nic_rx_shared_tir)) + return ERR_PTR(-ENOTSUPP); + + ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ? + MLX5_FLOW_NAMESPACE_SNIFFER_RX : + MLX5_FLOW_NAMESPACE_SNIFFER_TX); + + prio = &dev->flow_db.sniffer[ft_type]; + priority = 0; + num_entries = 1; + num_groups = 1; + } + + if (!ns) + return ERR_PTR(-ENOTSUPP); + + ft = prio->flow_table; + if (!ft) { + ft = mlx5_create_auto_grouped_flow_table(ns, priority, "bypass", + num_entries, + num_groups); + + if (!IS_ERR(ft)) { + prio->refcount = 0; + prio->flow_table = ft; + } else { + err = PTR_ERR(ft); + } + } + + return err ? ERR_PTR(err) : prio; +} + +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_flow_table *ft = ft_prio->flow_table; + struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_spec *spec; + const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); + unsigned int spec_index; + u32 action; + int err = 0; + + if (!is_valid_attr(flow_attr)) + return ERR_PTR(-EINVAL); + + spec = mlx5_vzalloc(sizeof(*spec)); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !spec) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + err = parse_flow_attr(spec->match_criteria, + spec->match_value, ib_flow); + if (err < 0) + goto free; + + ib_flow += ((union ib_flow_spec *)ib_flow)->size; + } + + spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); + action = dst ? 
MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; + handler->rule = mlx5_add_flow_rule(ft, spec->match_criteria_enable, + spec->match_criteria, + spec->match_value, + action, + MLX5_FS_DEFAULT_FLOW_TAG, + dst); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + ft_prio->refcount++; + handler->prio = ft_prio; + + ft_prio->flow_table = ft; +free: + if (err) + kfree(handler); + kvfree(spec); + return err ? ERR_PTR(err) : handler; +} + +static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_dst = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + handler = create_flow_rule(dev, ft_prio, flow_attr, NULL); + if (!IS_ERR(handler)) { + handler_dst = create_flow_rule(dev, ft_prio, + flow_attr, dst); + if (IS_ERR(handler_dst)) { + mlx5_del_flow_rule(handler->rule); + ft_prio->refcount--; + kfree(handler); + handler = handler_dst; + } else { + list_add(&handler_dst->list, &handler->list); + } + } + + return handler; +} +enum { + LEFTOVERS_MC, + LEFTOVERS_UC, +}; + +static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_ucast = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + static struct { + struct ib_flow_attr flow_attr; + struct ib_flow_spec_eth eth_flow; + } leftovers_specs[] = { + [LEFTOVERS_MC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {0x1} } + } + }, + [LEFTOVERS_UC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {} } + } + } + }; + + handler = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_MC].flow_attr, + dst); + if (!IS_ERR(handler) && + flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { + handler_ucast = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_UC].flow_attr, + dst); + if (IS_ERR(handler_ucast)) { + mlx5_del_flow_rule(handler->rule); + ft_prio->refcount--; + kfree(handler); + handler = handler_ucast; + } else { + list_add(&handler_ucast->list, &handler->list); + } + } + + return handler; +} + +static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_rx, + struct mlx5_ib_flow_prio *ft_tx, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_rx; + struct mlx5_ib_flow_handler *handler_tx; + int err; + static const struct ib_flow_attr flow_attr = { + .num_of_specs = 0, + .size = sizeof(flow_attr) + }; + + handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst); + if (IS_ERR(handler_rx)) { + err = PTR_ERR(handler_rx); + goto err; + } + + handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst); + if (IS_ERR(handler_tx)) { + err = PTR_ERR(handler_tx); + goto err_tx; + } + + list_add(&handler_tx->list, &handler_rx->list); + + return handler_rx; + +err_tx: + mlx5_del_flow_rule(handler_rx->rule); + ft_rx->refcount--; + kfree(handler_rx); +err: + return ERR_PTR(err); +} + +static struct ib_flow 
*mlx5_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_flow_handler *handler = NULL; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_ib_flow_prio *ft_prio_tx = NULL; + struct mlx5_ib_flow_prio *ft_prio; + int err; + + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) + return ERR_PTR(-ENOSPC); + + if (domain != IB_FLOW_DOMAIN_USER || + flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || + (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) + return ERR_PTR(-EINVAL); + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + mutex_lock(&dev->flow_db.lock); + + ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX); + if (IS_ERR(ft_prio_tx)) { + err = PTR_ERR(ft_prio_tx); + ft_prio_tx = NULL; + goto destroy_ft; + } + } + + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + if (mqp->flags & MLX5_IB_QP_RSS) + dst->tir_num = mqp->rss_qp.tirn; + else + dst->tir_num = mqp->raw_packet_qp.rq.tirn; + + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) { + handler = create_dont_trap_rule(dev, ft_prio, + flow_attr, dst); + } else { + handler = create_flow_rule(dev, ft_prio, flow_attr, + dst); + } + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + handler = create_leftovers_rule(dev, ft_prio, flow_attr, + dst); + } else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) { + handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst); + } else { + err = -EINVAL; + goto destroy_ft; + } + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + handler = NULL; + goto destroy_ft; + } + + mutex_unlock(&dev->flow_db.lock); + kfree(dst); + + return &handler->ibflow; + +destroy_ft: + put_flow_table(dev, ft_prio, false); + if (ft_prio_tx) + put_flow_table(dev, ft_prio_tx, false); +unlock: + mutex_unlock(&dev->flow_db.lock); + kfree(dst); + kfree(handler); + return ERR_PTR(err); +} + static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; - if (ibqp->qp_type == IB_QPT_RAW_PACKET) - err = -EOPNOTSUPP; - else - err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); @@ -1208,10 +2146,7 @@ static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; - if (ibqp->qp_type == IB_QPT_RAW_PACKET) - err = -EOPNOTSUPP; - else - err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); @@ -1256,21 +2191,12 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct mlx5_ib_dev *dev = - container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev), - fw_rev_min(dev->mdev), 
fw_rev_sub(dev->mdev)); -} - static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%x\n", (unsigned)dev->mdev->pdev->revision); + return sprintf(buf, "%x\n", dev->mdev->pdev->revision); } static ssize_t show_board(struct device *device, struct device_attribute *attr, @@ -1283,7 +2209,6 @@ static ssize_t show_board(struct device *device, struct device_attribute *attr, } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); @@ -1291,13 +2216,23 @@ static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); static struct device_attribute *mlx5_class_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_fw_pages, &dev_attr_reg_pages, }; +static void pkey_change_handler(struct work_struct *work) +{ + struct mlx5_ib_port_resources *ports = + container_of(work, struct mlx5_ib_port_resources, + pkey_change_work); + + mutex_lock(&ports->devr->mutex); + mlx5_ib_gsi_pkey_change(ports->gsi); + mutex_unlock(&ports->devr->mutex); +} + static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) { struct mlx5_ib_qp *mqp; @@ -1308,7 +2243,6 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) unsigned long flags_cq; unsigned long flags; - mlx5_ib_warn(ibdev, " started\n"); INIT_LIST_HEAD(&cq_armed_list); /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ @@ -1356,8 +2290,6 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) mcq->comp(mcq); } spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); - mlx5_ib_warn(ibdev, " ended\n"); - return; } static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, @@ -1365,25 +2297,30 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, { struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; struct ib_event ibev; - + bool fatal = false; u8 port = 0; switch (event) { case MLX5_DEV_EVENT_SYS_ERROR: - ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); + fatal = true; break; case MLX5_DEV_EVENT_PORT_UP: - ibev.event = IB_EVENT_PORT_ACTIVE; - port = (u8)param; - break; - case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: - ibev.event = IB_EVENT_PORT_ERR; port = (u8)param; + + /* In RoCE, port up/down events are handled in + * mlx5_netdev_event(). + */ + if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET) + return; + + ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ? 
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; break; case MLX5_DEV_EVENT_LID_CHANGE: @@ -1394,6 +2331,8 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; port = (u8)param; + + schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: @@ -1413,14 +2352,16 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, ibev.device = &ibdev->ib_dev; ibev.element.port_num = port; - if ((event != MLX5_DEV_EVENT_SYS_ERROR) && - (port < 1 || port > ibdev->num_ports)) { + if (port < 1 || port > ibdev->num_ports) { mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); return; } if (ibdev->ib_active) ib_dispatch_event(&ibev); + + if (fatal) + ibdev->ib_active = false; } static void get_ext_port_caps(struct mlx5_ib_dev *dev) @@ -1431,32 +2372,13 @@ static void get_ext_port_caps(struct mlx5_ib_dev *dev) mlx5_query_ext_port_caps(dev, port); } -static void config_atomic_responder(struct mlx5_ib_dev *dev, - struct ib_device_attr *props) -{ - enum ib_atomic_cap cap = props->atomic_cap; - -#if 0 - if (cap == IB_ATOMIC_HCA || - cap == IB_ATOMIC_GLOB) -#endif - dev->enable_atomic_resp = 1; - - dev->atomic_cap = cap; -} - -enum mlx5_addr_align { - MLX5_ADDR_ALIGN_0 = 0, - MLX5_ADDR_ALIGN_64 = 64, - MLX5_ADDR_ALIGN_128 = 128, -}; - static int get_port_caps(struct mlx5_ib_dev *dev) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; int port; + struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); if (!pprops) @@ -1466,12 +2388,11 @@ static int get_port_caps(struct mlx5_ib_dev *dev) if (!dprops) goto out; - err = mlx5_ib_query_device(&dev->ib_dev, dprops); + err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); goto out; } - config_atomic_responder(dev, dprops); for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); @@ -1480,8 +2401,10 @@ static int get_port_caps(struct mlx5_ib_dev *dev) port, err); break; } - dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys; - dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len; + dev->mdev->port_caps[port - 1].pkey_table_len = + dprops->max_pkeys; + dev->mdev->port_caps[port - 1].gid_table_len = + pprops->gid_tbl_len; mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", dprops->max_pkeys, pprops->gid_tbl_len); } @@ -1501,7 +2424,8 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev) if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); - ib_dereg_mr(dev->umrc.mr); + mlx5_ib_destroy_qp(dev->umrc.qp); + ib_free_cq(dev->umrc.cq); ib_dealloc_pd(dev->umrc.pd); } @@ -1511,40 +2435,107 @@ enum { static int create_umr_res(struct mlx5_ib_dev *dev) { + struct ib_qp_init_attr *init_attr = NULL; + struct ib_qp_attr *attr = NULL; struct ib_pd *pd; - struct ib_mr *mr; + struct ib_cq *cq; + struct ib_qp *qp; int ret; - pd = ib_alloc_pd(&dev->ib_dev); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto error_0; + } + + pd = ib_alloc_pd(&dev->ib_dev, 0); if (IS_ERR(pd)) { mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); ret = PTR_ERR(pd); goto error_0; } - mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(mr)) { - mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n"); - 
ret = PTR_ERR(mr); - goto error_1; + cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto error_2; } - dev->umrc.mr = mr; + init_attr->send_cq = cq; + init_attr->recv_cq = cq; + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->cap.max_send_wr = MAX_UMR_WR; + init_attr->cap.max_send_sge = 1; + init_attr->qp_type = MLX5_IB_QPT_REG_UMR; + init_attr->port_num = 1; + qp = mlx5_ib_create_qp(pd, init_attr, NULL); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto error_3; + } + qp->device = &dev->ib_dev; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = MLX5_IB_QPT_REG_UMR; + + attr->qp_state = IB_QPS_INIT; + attr->port_num = 1; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_PORT, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTR; + attr->path_mtu = IB_MTU_256; + + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + goto error_4; + } + + memset(attr, 0, sizeof(*attr)); + attr->qp_state = IB_QPS_RTS; + ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + goto error_4; + } + + dev->umrc.qp = qp; + dev->umrc.cq = cq; dev->umrc.pd = pd; + sema_init(&dev->umrc.sem, MAX_UMR_WR); ret = mlx5_mr_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); goto error_4; } + kfree(attr); + kfree(init_attr); + return 0; error_4: - ib_dereg_mr(mr); -error_1: + mlx5_ib_destroy_qp(qp); + +error_3: + ib_free_cq(cq); + +error_2: ib_dealloc_pd(pd); + error_0: + kfree(attr); + kfree(init_attr); return ret; } @@ -1552,11 +2543,14 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) { struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; + struct ib_cq_init_attr cq_attr = {.cqe = 1}; + int port; int ret = 0; - struct ib_cq_init_attr cq_attr = { .cqe = 1 }; dev = container_of(devr, struct mlx5_ib_dev, devr); + mutex_init(&devr->mutex); + devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); @@ -1618,7 +2612,7 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s0->event_handler = NULL; devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; - devr->s0->ext.xrc.xrcd = devr->x0; + devr->s0->ext.xrc.xrcd = devr->x0; devr->s0->ext.xrc.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); atomic_inc(&devr->s0->ext.xrc.cq->usecnt); @@ -1642,7 +2636,13 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s1->srq_type = IB_SRQT_BASIC; devr->s1->ext.xrc.cq = devr->c0; atomic_inc(&devr->p0->usecnt); - atomic_set(&devr->s1->usecnt, 0); + atomic_set(&devr->s0->usecnt, 0); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + devr->ports[port].devr = devr; + } return 0; @@ -1662,12 +2662,20 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) static void destroy_dev_resources(struct mlx5_ib_resources *devr) { + struct mlx5_ib_dev *dev = + container_of(devr, struct mlx5_ib_dev, devr); + int port; + mlx5_ib_destroy_srq(devr->s1); mlx5_ib_destroy_srq(devr->s0); mlx5_ib_dealloc_xrcd(devr->x0); mlx5_ib_dealloc_xrcd(devr->x1); 
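
The per-port pkey_change_work initialized just above follows the usual deferred-work pattern: the device-event callback only schedules it (see the MLX5_DEV_EVENT_PKEY_CHANGE case), the handler then runs in process context where it can take the devr mutex and update the GSI QP, and teardown waits for it with cancel_work_sync() below. A generic sketch of that life cycle, using the same workqueue API this code builds on but hypothetical names:

struct example_port {
	struct work_struct pkey_change_work;
};

static void example_pkey_handler(struct work_struct *work)
{
	struct example_port *port =
		container_of(work, struct example_port, pkey_change_work);

	/* Process context: sleeping locks are allowed here, unlike in the
	 * event callback that scheduled us. */
	(void)port;
}

static void example_port_lifecycle(struct example_port *port)
{
	INIT_WORK(&port->pkey_change_work, example_pkey_handler);

	/* From the (possibly atomic) event handler: */
	schedule_work(&port->pkey_change_work);

	/* On teardown, once no new events can arrive: */
	cancel_work_sync(&port->pkey_change_work);
}
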
mlx5_ib_destroy_cq(devr->c0); mlx5_ib_dealloc_pd(devr->p0); + + /* Make sure no change P_Key work items are still executing */ + for (port = 0; port < dev->num_ports; ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); } static u32 get_core_cap_flags(struct ib_device *ibdev) @@ -1681,13 +2689,11 @@ static u32 get_core_cap_flags(struct ib_device *ibdev) if (ll == IB_LINK_LAYER_INFINIBAND) return RDMA_CORE_PORT_IBA_IB; - ret = RDMA_CORE_PORT_RAW_PACKET; - if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) - return ret; + return 0; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) - return ret; + return 0; if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE; @@ -1702,117 +2708,99 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; - struct mlx5_ib_dev *dev = to_mdev(ibdev); - enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); int err; - immutable->core_cap_flags = get_core_cap_flags(ibdev); - - err = ib_query_port(ibdev, port_num, &attr); + err = mlx5_ib_query_port(ibdev, port_num, &attr); if (err) return err; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->core_cap_flags = get_core_cap_flags(ibdev); - if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce)) - immutable->max_mad_size = IB_MGMT_MAD_SIZE; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } -static void enable_dc_tracer(struct mlx5_ib_dev *dev) +static void get_dev_fw_str(struct ib_device *ibdev, char *str, + size_t str_len) { - struct device *device = dev->ib_dev.dma_device; - struct mlx5_dc_tracer *dct = &dev->dctr; - int order; - void *tmp; - int size; - int err; - - size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096; - if (size <= PAGE_SIZE) - order = 0; - else - order = 1; - - dct->pg = alloc_pages(GFP_KERNEL, order); - if (!dct->pg) { - mlx5_ib_err(dev, "failed to allocate %d pages\n", order); - return; - } - - tmp = page_address(dct->pg); - memset(tmp, 0xff, size); - - dct->size = size; - dct->order = order; - dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE); - if (dma_mapping_error(device, dct->dma)) { - mlx5_ib_err(dev, "dma mapping error\n"); - goto map_err; - } - - err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma); - if (err) { - mlx5_ib_warn(dev, "failed to enable DC tracer\n"); - goto cmd_err; - } - - return; - -cmd_err: - dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE); -map_err: - __free_pages(dct->pg, dct->order); - dct->pg = NULL; + struct mlx5_ib_dev *dev = + container_of(ibdev, struct mlx5_ib_dev, ib_dev); + snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), + fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); } -static void disable_dc_tracer(struct mlx5_ib_dev *dev) +static int mlx5_roce_lag_init(struct mlx5_ib_dev *dev) { - struct device *device = dev->ib_dev.dma_device; - struct mlx5_dc_tracer *dct = &dev->dctr; - int err; - - if (!dct->pg) - return; - - err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma); - if (err) { - mlx5_ib_warn(dev, "failed to disable DC tracer\n"); - return; - } - - dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE); - __free_pages(dct->pg, dct->order); - dct->pg = NULL; -} - -enum { - MLX5_DC_CNAK_SIZE = 128, - MLX5_NUM_BUF_IN_PAGE = PAGE_SIZE / MLX5_DC_CNAK_SIZE, - MLX5_CNAK_TX_CQ_SIGNAL_FACTOR = 128, - MLX5_DC_CNAK_SL = 0, - MLX5_DC_CNAK_VL = 0, -}; - -static int init_dc_improvements(struct mlx5_ib_dev *dev) -{ - if 
(!mlx5_core_is_pf(dev->mdev)) - return 0; - - if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace))) - return 0; - - enable_dc_tracer(dev); - return 0; } -static void cleanup_dc_improvements(struct mlx5_ib_dev *dev) +static void mlx5_roce_lag_cleanup(struct mlx5_ib_dev *dev) { +} - disable_dc_tracer(dev); +static void mlx5_remove_roce_notifier(struct mlx5_ib_dev *dev) +{ + if (dev->roce.nb.notifier_call) { + unregister_netdevice_notifier(&dev->roce.nb); + dev->roce.nb.notifier_call = NULL; + } +} + +static int mlx5_enable_roce(struct mlx5_ib_dev *dev) +{ + VNET_ITERATOR_DECL(vnet_iter); + struct net_device *idev; + int err; + + /* Check if mlx5en net device already exists */ + VNET_LIST_RLOCK(); + VNET_FOREACH(vnet_iter) { + IFNET_RLOCK(); + CURVNET_SET_QUIET(vnet_iter); + TAILQ_FOREACH(idev, &V_ifnet, if_link) { + /* check if network interface belongs to mlx5en */ + if (!mlx5_netdev_match(idev, dev->mdev, "mce")) + continue; + write_lock(&dev->roce.netdev_lock); + dev->roce.netdev = idev; + write_unlock(&dev->roce.netdev_lock); + } + CURVNET_RESTORE(); + IFNET_RUNLOCK(); + } + VNET_LIST_RUNLOCK(); + + dev->roce.nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce.nb); + if (err) { + dev->roce.nb.notifier_call = NULL; + return err; + } + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + goto err_unregister_netdevice_notifier; + + err = mlx5_roce_lag_init(dev); + if (err) + goto err_disable_roce; + + return 0; + +err_disable_roce: + mlx5_nic_vport_disable_roce(dev->mdev); + +err_unregister_netdevice_notifier: + mlx5_remove_roce_notifier(dev); + return err; +} + +static void mlx5_disable_roce(struct mlx5_ib_dev *dev) +{ + mlx5_roce_lag_cleanup(dev); + mlx5_nic_vport_disable_roce(dev->mdev); } static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num) @@ -1842,8 +2830,8 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) &dev->port[i].q_cnt_id); if (ret) { mlx5_ib_warn(dev, - "couldn't allocate queue counter for port %d\n", - i + 1); + "couldn't allocate queue counter for port %d, err %d\n", + i + 1, ret); goto dealloc_counters; } } @@ -1857,199 +2845,93 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) return ret; } -struct port_attribute { - struct attribute attr; - ssize_t (*show)(struct mlx5_ib_port *, - struct port_attribute *, char *buf); - ssize_t (*store)(struct mlx5_ib_port *, - struct port_attribute *, - const char *buf, size_t count); +static const char * const names[] = { + "rx_write_requests", + "rx_read_requests", + "rx_atomic_requests", + "out_of_buffer", + "out_of_sequence", + "duplicate_request", + "rnr_nak_retry_err", + "packet_seq_err", + "implied_nak_seq_err", + "local_ack_timeout_err", }; -struct port_counter_attribute { - struct port_attribute attr; - size_t offset; +static const size_t stats_offsets[] = { + MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests), + MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests), + MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests), + MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer), + MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence), + MLX5_BYTE_OFF(query_q_counter_out, duplicate_request), + MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err), + MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err), + MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err), + MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err), }; -static ssize_t port_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) +static struct rdma_hw_stats 
*mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) { - struct port_attribute *port_attr = - container_of(attr, struct port_attribute, attr); - struct mlx5_ib_port_sysfs_group *p = - container_of(kobj, struct mlx5_ib_port_sysfs_group, - kobj); - struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port, - group); + BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets)); - if (!port_attr->show) - return -EIO; + /* We support only per port stats */ + if (port_num == 0) + return NULL; - return port_attr->show(mibport, port_attr, buf); + return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names), + RDMA_HW_STATS_DEFAULT_LIFESPAN); } -static ssize_t show_port_counter(struct mlx5_ib_port *p, - struct port_attribute *port_attr, - char *buf) +static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port, int index) { + struct mlx5_ib_dev *dev = to_mdev(ibdev); int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); - struct port_counter_attribute *counter_attr = - container_of(port_attr, struct port_counter_attribute, attr); void *out; + __be32 val; int ret; + int i; + + if (!port || !stats) + return -ENOSYS; out = mlx5_vzalloc(outlen); if (!out) return -ENOMEM; - ret = mlx5_vport_query_q_counter(p->dev->mdev, - p->q_cnt_id, 0, - out, outlen); + ret = mlx5_vport_query_q_counter(dev->mdev, + dev->port[port - 1].q_cnt_id, 0, + out, outlen); if (ret) goto free; - ret = sprintf(buf, "%d\n", - be32_to_cpu(*(__be32 *)(out + counter_attr->offset))); - + for (i = 0; i < ARRAY_SIZE(names); i++) { + val = *(__be32 *)(out + stats_offsets[i]); + stats->value[i] = (u64)be32_to_cpu(val); + } free: - kfree(out); - return ret; -} - -#define PORT_COUNTER_ATTR(_name) \ -struct port_counter_attribute port_counter_attr_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_port_counter, NULL), \ - .offset = MLX5_BYTE_OFF(query_q_counter_out, _name) \ -} - -static PORT_COUNTER_ATTR(rx_write_requests); -static PORT_COUNTER_ATTR(rx_read_requests); -static PORT_COUNTER_ATTR(rx_atomic_requests); -static PORT_COUNTER_ATTR(rx_dct_connect); -static PORT_COUNTER_ATTR(out_of_buffer); -static PORT_COUNTER_ATTR(out_of_sequence); -static PORT_COUNTER_ATTR(duplicate_request); -static PORT_COUNTER_ATTR(rnr_nak_retry_err); -static PORT_COUNTER_ATTR(packet_seq_err); -static PORT_COUNTER_ATTR(implied_nak_seq_err); -static PORT_COUNTER_ATTR(local_ack_timeout_err); - -static struct attribute *counter_attrs[] = { - &port_counter_attr_rx_write_requests.attr.attr, - &port_counter_attr_rx_read_requests.attr.attr, - &port_counter_attr_rx_atomic_requests.attr.attr, - &port_counter_attr_rx_dct_connect.attr.attr, - &port_counter_attr_out_of_buffer.attr.attr, - &port_counter_attr_out_of_sequence.attr.attr, - &port_counter_attr_duplicate_request.attr.attr, - &port_counter_attr_rnr_nak_retry_err.attr.attr, - &port_counter_attr_packet_seq_err.attr.attr, - &port_counter_attr_implied_nak_seq_err.attr.attr, - &port_counter_attr_local_ack_timeout_err.attr.attr, - NULL -}; - -static struct attribute_group port_counters_group = { - .name = "counters", - .attrs = counter_attrs -}; - -static const struct sysfs_ops port_sysfs_ops = { - .show = port_attr_show -}; - -static struct kobj_type port_type = { - .sysfs_ops = &port_sysfs_ops, -}; - -static int add_port_attrs(struct mlx5_ib_dev *dev, - struct kobject *parent, - struct mlx5_ib_port_sysfs_group *port, - u8 port_num) -{ - int ret; - - ret = kobject_init_and_add(&port->kobj, &port_type, - parent, - "%d", port_num); - if (ret) - return ret; - - if 
(MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && - MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { - ret = sysfs_create_group(&port->kobj, &port_counters_group); - if (ret) - goto put_kobj; - } - - port->enabled = true; - return ret; - -put_kobj: - kobject_put(&port->kobj); - return ret; -} - -static void destroy_ports_attrs(struct mlx5_ib_dev *dev, - unsigned int num_ports) -{ - unsigned int i; - - for (i = 0; i < num_ports; i++) { - struct mlx5_ib_port_sysfs_group *port = - &dev->port[i].group; - - if (!port->enabled) - continue; - - if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && - MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) - sysfs_remove_group(&port->kobj, - &port_counters_group); - kobject_put(&port->kobj); - port->enabled = false; - } - - if (dev->ports_parent) { - kobject_put(dev->ports_parent); - dev->ports_parent = NULL; - } -} - -static int create_port_attrs(struct mlx5_ib_dev *dev) -{ - int ret = 0; - unsigned int i = 0; - struct device *device = &dev->ib_dev.dev; - - dev->ports_parent = kobject_create_and_add("mlx5_ports", - &device->kobj); - if (!dev->ports_parent) - return -ENOMEM; - - for (i = 0; i < dev->num_ports; i++) { - ret = add_port_attrs(dev, - dev->ports_parent, - &dev->port[i].group, - i + 1); - - if (ret) - goto _destroy_ports_attrs; - } - - return 0; - -_destroy_ports_attrs: - destroy_ports_attrs(dev, i); - return ret; + kvfree(out); + return ARRAY_SIZE(names); } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; + enum rdma_link_layer ll; + int port_type_cap; + const char *name; int err; int i; + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce)) + return NULL; + printk_once(KERN_INFO "%s", mlx5_version); dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); @@ -2059,17 +2941,11 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->mdev = mdev; dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), - GFP_KERNEL); + GFP_KERNEL); if (!dev->port) goto err_dealloc; - for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { - dev->port[i].dev = dev; - dev->port[i].port_num = i; - dev->port[i].port_gone = 0; - memset(dev->port[i].gid_table, 0, sizeof(dev->port[i].gid_table)); - } - + rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_free_port; @@ -2077,23 +2953,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == - IB_LINK_LAYER_ETHERNET) { - if (MLX5_CAP_GEN(mdev, roce)) { - err = mlx5_nic_vport_enable_roce(mdev); - if (err) - goto err_free_port; - } else { - goto err_free_port; - } - } - MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); - strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); + name = "mlx5_%d"; + + strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; - dev->ib_dev.local_dma_lkey = mdev->special_contexts.resd_lkey; + dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = @@ -2108,6 +2975,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull 
<< IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | @@ -2125,12 +2993,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); + dev->ib_dev.uverbs_ex_cmd_mask = + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; - dev->ib_dev.get_netdev = mlx5_ib_get_netdev; + if (ll == IB_LINK_LAYER_ETHERNET) + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.add_gid = mlx5_ib_add_gid; + dev->ib_dev.del_gid = mlx5_ib_del_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; @@ -2161,15 +3036,40 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq; dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr; dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr; - dev->ib_dev.reg_phys_mr = mlx5_ib_reg_phys_mr; + dev->ib_dev.rereg_user_mr = mlx5_ib_rereg_user_mr; dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr; dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach; dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach; dev->ib_dev.process_mad = mlx5_ib_process_mad; + dev->ib_dev.alloc_mr = mlx5_ib_alloc_mr; + dev->ib_dev.map_mr_sg = mlx5_ib_map_mr_sg; + dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; dev->ib_dev.get_port_immutable = mlx5_port_immutable; - dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr; - dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list; - dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; + dev->ib_dev.get_dev_fw_str = get_dev_fw_str; + if (mlx5_core_is_pf(mdev)) { + dev->ib_dev.get_vf_config = mlx5_ib_get_vf_config; + dev->ib_dev.set_vf_link_state = mlx5_ib_set_vf_link_state; + dev->ib_dev.get_vf_stats = mlx5_ib_get_vf_stats; + dev->ib_dev.set_vf_guid = mlx5_ib_set_vf_guid; + } + + dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext; + + mlx5_ib_internal_fill_odp_caps(dev); + + if (MLX5_CAP_GEN(mdev, imaicl)) { + dev->ib_dev.alloc_mw = mlx5_ib_alloc_mw; + dev->ib_dev.dealloc_mw = mlx5_ib_dealloc_mw; + dev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && + MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; + dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; + } if (MLX5_CAP_GEN(mdev, xrc)) { dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; @@ -2179,18 +3079,46 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) { + dev->ib_dev.create_flow = mlx5_ib_create_flow; + dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; + dev->ib_dev.create_wq = mlx5_ib_create_wq; + dev->ib_dev.modify_wq = mlx5_ib_modify_wq; + dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; + dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; + dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) | 
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + } err = init_node_data(dev); if (err) - goto err_disable_roce; + goto err_free_port; + mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + if (ll == IB_LINK_LAYER_ETHERNET) { + err = mlx5_enable_roce(dev); + if (err) + goto err_free_port; + } + err = create_dev_resources(&dev->devr); if (err) goto err_disable_roce; + err = mlx5_ib_odp_init_one(dev); + if (err) + goto err_rsrc; err = mlx5_ib_alloc_q_counters(dev); if (err) @@ -2204,44 +3132,18 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_dev; - if (MLX5_CAP_GEN(dev->mdev, port_type) == - MLX5_CAP_PORT_TYPE_IB) { - if (init_dc_improvements(dev)) - mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n"); - } - - err = create_port_attrs(dev); - if (err) - goto err_dc; - for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) - goto err_port_attrs; - } - - if (1) { - struct thread *rl_thread = NULL; - struct proc *rl_proc = NULL; - - for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { - (void) kproc_kthread_add(mlx5_ib_roce_port_update, dev->port + i, &rl_proc, &rl_thread, - RFHIGHPID, 0, "mlx5-ib-roce-port", "mlx5-ib-roce_port-%d", i); - } + goto err_umrc; } dev->ib_active = true; return dev; -err_port_attrs: - destroy_ports_attrs(dev, dev->num_ports); - -err_dc: - if (MLX5_CAP_GEN(dev->mdev, port_type) == - MLX5_CAP_PORT_TYPE_IB) - cleanup_dc_improvements(dev); +err_umrc: destroy_umrc_res(dev); err_dev: @@ -2251,12 +3153,17 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) mlx5_ib_dealloc_q_counters(dev); err_odp: + mlx5_ib_odp_remove_one(dev); + +err_rsrc: destroy_dev_resources(&dev->devr); err_disable_roce: - if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == - IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) - mlx5_nic_vport_disable_roce(mdev); + if (ll == IB_LINK_LAYER_ETHERNET) { + mlx5_disable_roce(dev); + mlx5_remove_roce_notifier(dev); + } + err_free_port: kfree(dev->port); @@ -2269,32 +3176,16 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; - int i; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); - for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) { - dev->port[i].port_gone = 1; - while (dev->port[i].port_gone != 2) - pause("W", hz); - } - - for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { - device_remove_file(&dev->ib_dev.dev, - mlx5_class_attributes[i]); - } - - destroy_ports_attrs(dev, dev->num_ports); - if (MLX5_CAP_GEN(dev->mdev, port_type) == - MLX5_CAP_PORT_TYPE_IB) - cleanup_dc_improvements(dev); - mlx5_ib_dealloc_q_counters(dev); + mlx5_remove_roce_notifier(dev); ib_unregister_device(&dev->ib_dev); + mlx5_ib_dealloc_q_counters(dev); destroy_umrc_res(dev); + mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); - - if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == - IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce)) - mlx5_nic_vport_disable_roce(mdev); - + if (ll == IB_LINK_LAYER_ETHERNET) + mlx5_disable_roce(dev); kfree(dev->port); ib_dealloc_device(&dev->ib_dev); } @@ -2311,31 +3202,27 @@ static int __init 
mlx5_ib_init(void) int err; if (deprecated_prof_sel != 2) - printf("mlx5_ib: WARN: ""prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); + pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); + + err = mlx5_ib_odp_init(); + if (err) + return err; err = mlx5_register_interface(&mlx5_ib_interface); if (err) goto clean_odp; - mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq"); - if (!mlx5_ib_wq) { - printf("mlx5_ib: ERR: ""%s: failed to create mlx5_ib_wq\n", __func__); - goto err_unreg; - } - return err; -err_unreg: - mlx5_unregister_interface(&mlx5_ib_interface); - clean_odp: + mlx5_ib_odp_cleanup(); return err; } static void __exit mlx5_ib_cleanup(void) { - destroy_workqueue(mlx5_ib_wq); mlx5_unregister_interface(&mlx5_ib_interface); + mlx5_ib_odp_cleanup(); } module_init_order(mlx5_ib_init, SI_ORDER_THIRD); diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c index f942f8b9e8ce..58f985b33ff4 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mem.c @@ -27,10 +27,9 @@ #include #include +#include #include "mlx5_ib.h" -CTASSERT(sizeof(uintptr_t) == sizeof(unsigned long)); - /* @umem: umem object to scan * @addr: ib virtual address requested by the user * @count: number of PAGE_SIZE pages covered by umem @@ -38,7 +37,6 @@ CTASSERT(sizeof(uintptr_t) == sizeof(unsigned long)); * @ncont: number of compund pages * @order: log2 of the number of compound pages */ - void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int *ncont, int *order) { @@ -55,29 +53,38 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, int entry; unsigned long page_shift = ilog2(umem->page_size); + /* With ODP we must always match OS page size. */ + if (umem->odp_data) { + *count = ib_umem_page_count(umem); + *shift = PAGE_SHIFT; + *ncont = *count; + if (order) + *order = ilog2(roundup_pow_of_two(*count)); + + return; + } + addr = addr >> page_shift; - tmp = (uintptr_t)addr; - m = find_first_bit(&tmp, 8 * sizeof(tmp)); + tmp = (unsigned long)addr; + m = find_first_bit(&tmp, BITS_PER_LONG); skip = 1 << m; mask = skip - 1; i = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = sg_dma_len(sg) >> page_shift; pfn = sg_dma_address(sg) >> page_shift; for (k = 0; k < len; k++) { if (!(i & mask)) { - tmp = (uintptr_t)pfn; - m = min_t(unsigned long, m, - find_first_bit(&tmp, 8 * sizeof(tmp))); + tmp = (unsigned long)pfn; + m = min_t(unsigned long, m, find_first_bit(&tmp, BITS_PER_LONG)); skip = 1 << m; mask = skip - 1; base = pfn; p = 0; } else { if (base + p != pfn) { - tmp = (uintptr_t)p; - m = find_first_bit(&tmp, 8 * sizeof(tmp)); + tmp = (unsigned long)p; + m = find_first_bit(&tmp, BITS_PER_LONG); skip = 1 << m; mask = skip - 1; base = pfn; @@ -108,6 +115,20 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, *count = i; } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static u64 umem_dma_to_mtt(dma_addr_t umem_dma) +{ + u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; + + if (umem_dma & ODP_READ_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_READ; + if (umem_dma & ODP_WRITE_ALLOWED_BIT) + mtt_entry |= MLX5_IB_MTT_WRITE; + + return mtt_entry; +} +#endif + /* * Populate the given array with bus addresses from the umem. * @@ -121,8 +142,8 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, * access_flags - access flags to set on all present pages. use enum mlx5_ib_mtt_access_flags for this. 
*/ -static void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int page_shift, size_t offset, +void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int page_shift, size_t offset, size_t num_pages, __be64 *pas, int access_flags) { unsigned long umem_page_shift = ilog2(umem->page_size); @@ -134,6 +155,21 @@ static void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem int len; struct scatterlist *sg; int entry; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + const bool odp = umem->odp_data != NULL; + + if (odp) { + WARN_ON(shift != 0); + WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); + + for (i = 0; i < num_pages; ++i) { + dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + + pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + } + return; + } +#endif i = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { @@ -146,12 +182,10 @@ static void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem pas[i >> shift] = cpu_to_be64(cur); mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", - i >> shift, (unsigned long long) - be64_to_cpu(pas[i >> shift])); + i >> shift, (long long)be64_to_cpu(pas[i >> shift])); } else mlx5_ib_dbg(dev, "=====> 0x%llx\n", - (unsigned long long) - (base + (k << umem_page_shift))); + (long long)(base + (k << umem_page_shift))); i++; } } @@ -161,10 +195,9 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, __be64 *pas, int access_flags) { return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, - pas, + ib_umem_num_pages(umem), pas, access_flags); } - int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) { u64 page_size; @@ -182,6 +215,6 @@ int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) if (buf_off & off_mask) return -EINVAL; - *offset = (u32)(buf_off >> ilog2(off_size)); + *offset = buf_off >> ilog2(off_size); return 0; } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c index 72dcf6311804..4e893532d480 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c @@ -25,29 +25,37 @@ * $FreeBSD$ */ - #include #include -#include #include +#include #include +#include +#include #include "mlx5_ib.h" -CTASSERT((uintptr_t)PAGE_MASK > (uintptr_t)PAGE_SIZE); - enum { MAX_PENDING_REG_MR = 8, - MAX_MR_RELEASE_TIMEOUT = (60 * 20) /* Allow release timeout up to 20 min */ }; #define MLX5_UMR_ALIGN 2048 +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static __be64 mlx5_ib_update_mtt_emergency_buffer[ + MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] + __aligned(MLX5_UMR_ALIGN); +static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); +#endif -static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev); -static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev); +static int clean_mr(struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); + int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* Wait until all page fault handlers using the mr complete. 
*/ + synchronize_srcu(&dev->mr_srcu); +#endif return err; } @@ -62,6 +70,40 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) +{ + return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= + length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1)); +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +static void update_odp_mr(struct mlx5_ib_mr *mr) +{ + if (mr->umem->odp_data) { + /* + * This barrier prevents the compiler from moving the + * setting of umem->odp_data->private to point to our + * MR, before reg_umr finished, to ensure that the MR + * initialization have finished before starting to + * handle invalidations. + */ + smp_wmb(); + mr->umem->odp_data->private = mr; + /* + * Make sure we will see the new + * umem->odp_data->private value in the invalidation + * routines, before we can get page faults on the + * MR. Page faults can happen once we put the MR in + * the tree, below this line. Without the barrier, + * there can be a fault handling and an invalidation + * before umem->odp_data->private == mr is visible to + * the invalidation handler. + */ + smp_wmb(); + } +} +#endif + static void reg_mr_callback(int status, void *context) { struct mlx5_ib_mr *mr = context; @@ -69,28 +111,16 @@ static void reg_mr_callback(int status, void *context) struct mlx5_mr_cache *cache = &dev->cache; int c = order2idx(dev, mr->order); struct mlx5_cache_ent *ent = &cache->ent[c]; - struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_core_mr *mmr = &mr->mmr; - struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; - unsigned long flags; - int err; u8 key; + unsigned long flags; + struct mlx5_mr_table *table = &dev->mdev->priv.mr_table; + int err; spin_lock_irqsave(&ent->lock, flags); ent->pending--; spin_unlock_irqrestore(&ent->lock, flags); if (status) { - mlx5_ib_warn(dev, "async reg mr failed. status %d, order %d\n", status, ent->order); - kfree(mr); - dev->fill_delay = 1; - mod_timer(&dev->delay_timer, jiffies + HZ); - return; - } - - if (mr->out.hdr.status) { - mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", - mr->out.hdr.status, - be32_to_cpu(mr->out.hdr.syndrome)); + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); kfree(mr); dev->fill_delay = 1; mod_timer(&dev->delay_timer, jiffies + HZ); @@ -100,9 +130,7 @@ static void reg_mr_callback(int status, void *context) spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); - mmr->key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; - mlx5_ib_dbg(dev, "callbacked mkey 0x%x created\n", - be32_to_cpu(mr->out.mkey)); + mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key; cache->last_add = jiffies; @@ -113,29 +141,30 @@ static void reg_mr_callback(int status, void *context) spin_unlock_irqrestore(&ent->lock, flags); spin_lock_irqsave(&table->lock, flags); - err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mmr->key), mmr); + err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mr->mmkey.key), + &mr->mmkey); + if (err) + pr_err("Error inserting to mkey tree. 
0x%x\n", -err); spin_unlock_irqrestore(&table->lock, flags); - if (err) { - mlx5_ib_warn(dev, "failed radix tree insert of mkey 0x%x, %d\n", - mmr->key, err); - mlx5_core_destroy_mkey(mdev, mmr); - } } static int add_keys(struct mlx5_ib_dev *dev, int c, int num) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; - struct mlx5_create_mkey_mbox_in *in; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mr *mr; int npages = 1 << ent->order; + void *mkc; + u32 *in; int err = 0; int i; - in = kzalloc(sizeof(*in), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); for (i = 0; i < num; i++) { if (ent->pending >= MAX_PENDING_REG_MR) { err = -EAGAIN; @@ -150,18 +179,22 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) mr->order = ent->order; mr->umred = 1; mr->dev = dev; - in->seg.status = MLX5_MKEY_STATUS_FREE; - in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; - in->seg.log2_page_size = 12; + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2); + MLX5_SET(mkc, mkc, log_page_size, 12); spin_lock_irq(&ent->lock); ent->pending++; spin_unlock_irq(&ent->lock); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, - sizeof(*in), reg_mr_callback, - mr, &mr->out); + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, + (struct mlx5_create_mkey_mbox_in *)in, + inlen, reg_mr_callback, mr, + (struct mlx5_create_mkey_mbox_out *)mr->out); if (err) { spin_lock_irq(&ent->lock); ent->pending--; @@ -215,25 +248,12 @@ static int someone_adding(struct mlx5_mr_cache *cache) return 0; } -static int someone_releasing(struct mlx5_mr_cache *cache) -{ - int i; - - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - if (cache->ent[i].cur > 2 * cache->ent[i].limit) - return 1; - } - - return 0; -} - static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; struct mlx5_mr_cache *cache = &dev->cache; int i = order2idx(dev, ent->order); int err; - s64 dtime; if (cache->stopped) return; @@ -245,39 +265,38 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) if (err == -EAGAIN) { mlx5_ib_dbg(dev, "returned eagain, order %d\n", i + 2); - cancel_delayed_work(&ent->dwork); - if (!queue_delayed_work(cache->wq, &ent->dwork, - msecs_to_jiffies(3))) - mlx5_ib_warn(dev, "failed queueing delayed work\n"); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); } else if (err) { mlx5_ib_warn(dev, "command failed order %d, err %d\n", i + 2, err); - cancel_delayed_work(&ent->dwork); - if (!queue_delayed_work(cache->wq, &ent->dwork, - msecs_to_jiffies(1000))) - mlx5_ib_warn(dev, "failed queueing delayed work\n"); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); } else { - if (!queue_work(cache->wq, &ent->work)) - mlx5_ib_warn(dev, "failed queueing work\n"); + queue_work(cache->wq, &ent->work); } } } else if (ent->cur > 2 * ent->limit) { - dtime = (cache->last_add + (s64)cache->rel_timeout * HZ) - jiffies; - if (cache->rel_imm || - (cache->rel_timeout >= 0 && !someone_adding(cache) && dtime <= 0)) { + /* + * The remove_keys() logic is performed as garbage collection + * task. 
Such task is intended to be run when no other active + * processes are running. + * + * The need_resched() will return TRUE if there are user tasks + * to be activated in near future. + * + * In such case, we don't execute remove_keys() and postpone + * the garbage collection work to try to run in next cycle, + * in order to free CPU resources to other tasks. + */ + if (!need_resched() && !someone_adding(cache) && + time_after(jiffies, cache->last_add + 300 * HZ)) { remove_keys(dev, i, 1); if (ent->cur > ent->limit) - if (!queue_work(cache->wq, &ent->work)) - mlx5_ib_warn(dev, "failed queueing work\n"); - } else if (cache->rel_timeout >= 0) { - dtime = max_t(s64, dtime, 0); - dtime = min_t(s64, dtime, (MAX_MR_RELEASE_TIMEOUT * HZ)); - cancel_delayed_work(&ent->dwork); - if (!queue_delayed_work(cache->wq, &ent->dwork, dtime)) - mlx5_ib_warn(dev, "failed queueing delayed work\n"); + queue_work(cache->wq, &ent->work); + } else { + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); } - } else if (cache->rel_imm && !someone_releasing(cache)) { - cache->rel_imm = 0; } } @@ -297,6 +316,47 @@ static void cache_work_func(struct work_struct *work) __cache_work_func(ent); } +static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_mr *mr = NULL; + struct mlx5_cache_ent *ent; + int c; + int i; + + c = order2idx(dev, order); + if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); + return NULL; + } + + for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + ent = &cache->ent[i]; + + mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + + spin_lock_irq(&ent->lock); + if (!list_empty(&ent->head)) { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + break; + } + spin_unlock_irq(&ent->lock); + + queue_work(cache->wq, &ent->work); + } + + if (!mr) + cache->ent[c].miss++; + + return mr; +} + static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_mr_cache *cache = &dev->cache; @@ -318,8 +378,7 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) spin_unlock_irq(&ent->lock); if (shrink) - if (!queue_work(cache->wq, &ent->work)) - mlx5_ib_warn(dev, "failed queueing work\n"); + queue_work(cache->wq, &ent->work); } static void clean_keys(struct mlx5_ib_dev *dev, int c) @@ -343,8 +402,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) spin_unlock_irq(&ent->lock); err = destroy_mkey(dev, mr); if (err) - mlx5_ib_warn(dev, "failed destroy mkey 0x%x from order %d\n", - mr->mmr.key, ent->order); + mlx5_ib_warn(dev, "failed destroy mkey\n"); else kfree(mr); } @@ -357,27 +415,21 @@ static void delay_time_func(unsigned long ctx) dev->fill_delay = 0; } -enum { - MLX5_VF_MR_LIMIT = 2, -}; - int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int limit; - int err; int i; mutex_init(&dev->slow_path_mutex); - cache->rel_timeout = 300; - cache->wq = create_singlethread_workqueue("mkey_cache"); + cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); return -ENOMEM; } - setup_timer(&dev->delay_timer, delay_time_func, (uintptr_t)dev); + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); for (i = 0; i < 
MAX_MR_CACHE_ENTRIES; i++) { INIT_LIST_HEAD(&cache->ent[i].head); spin_lock_init(&cache->ent[i].lock); @@ -388,106 +440,79 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->order = i + 2; ent->dev = dev; - if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) { - if (mlx5_core_is_pf(dev->mdev)) - limit = dev->mdev->profile->mr_cache[i].limit; - else - limit = MLX5_VF_MR_LIMIT; - } else { + if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) + limit = dev->mdev->profile->mr_cache[i].limit; + else limit = 0; - } INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); ent->limit = limit; - if (!queue_work(cache->wq, &ent->work)) - mlx5_ib_warn(dev, "failed queueing work\n"); + queue_work(cache->wq, &ent->work); } - err = mlx5_mr_sysfs_init(dev); - if (err) - mlx5_ib_warn(dev, "failed to init mr cache sysfs\n"); - return 0; } -static void wait_for_async_commands(struct mlx5_ib_dev *dev) -{ - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent; - int total = 0; - int i; - int j; - - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - ent = &cache->ent[i]; - for (j = 0 ; j < 1000; j++) { - if (!ent->pending) - break; - msleep(50); - } - } - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - ent = &cache->ent[i]; - total += ent->pending; - } - - if (total) - mlx5_ib_dbg(dev, "aborted, %d pending requests\n", total); - else - mlx5_ib_dbg(dev, "done with all pending requests\n"); -} - int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) { int i; dev->cache.stopped = 1; flush_workqueue(dev->cache.wq); - mlx5_mr_sysfs_cleanup(dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) clean_keys(dev, i); destroy_workqueue(dev->cache.wq); - wait_for_async_commands(dev); del_timer_sync(&dev->delay_timer); + return 0; } struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) { struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_create_mkey_mbox_in *in; - struct mlx5_mkey_seg *seg; struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - in = kzalloc(sizeof(*in), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } - seg = &in->seg; - seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA; - seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64); - seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - seg->start_addr = 0; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, - NULL); + MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET64(mkc, mkc, start_addr, 0); + + err = mlx5_core_create_mkey(mdev, &mr->mmkey, + (struct mlx5_create_mkey_mbox_in *)in, + inlen, NULL, NULL, NULL); if (err) goto err_in; kfree(in); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; return &mr->ibmr; @@ -501,88 +526,415 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) return 
ERR_PTR(err); } -static int get_octo_len(u64 addr, u64 len, u64 page_size) +static int get_octo_len(u64 addr, u64 len, int page_size) { u64 offset; int npages; - offset = addr & (page_size - 1ULL); + offset = addr & (page_size - 1); npages = ALIGN(len + offset, page_size) >> ilog2(page_size); return (npages + 1) / 2; } -void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) +static int use_umr(int order) { - struct mlx5_ib_umr_context *context; - struct ib_wc wc; - int err; - - while (1) { - err = ib_poll_cq(cq, 1, &wc); - if (err < 0) { - printf("mlx5_ib: WARN: ""poll cq error %d\n", err); - return; - } - if (err == 0) - break; - - context = (struct mlx5_ib_umr_context *)(uintptr_t)wc.wr_id; - context->status = wc.status; - complete(&context->done); - } - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + return order <= MLX5_MAX_UMR_SHIFT; } -static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, - u64 length, struct ib_umem *umem, - int npages, int page_shift, - int access_flags) +static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, + int npages, int page_shift, int *size, + __be64 **mr_pas, dma_addr_t *dma) +{ + __be64 *pas; + struct device *ddev = dev->ib_dev.dma_device; + + /* + * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. + * To avoid copying garbage after the pas array, we allocate + * a little more. + */ + *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); + *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); + if (!(*mr_pas)) + return -ENOMEM; + + pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); + /* Clear padding after the actual pages. */ + memset(pas + npages, 0, *size - npages * sizeof(u64)); + + *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, *dma)) { + kfree(*mr_pas); + return -ENOMEM; + } + + return 0; +} + +static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_create_mkey_mbox_in *in; + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + sg->addr = dma; + sg->length = ALIGN(sizeof(u64) * n, 64); + sg->lkey = dev->umrc.pd->local_dma_lkey; + + wr->next = NULL; + wr->sg_list = sg; + if (n) + wr->num_sge = 1; + else + wr->num_sge = 0; + + wr->opcode = MLX5_IB_WR_UMR; + + umrwr->npages = n; + umrwr->page_shift = page_shift; + umrwr->mkey = key; +} + +static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, + struct ib_sge *sg, u64 dma, int n, u32 key, + int page_shift, u64 virt_addr, u64 len, + int access_flags) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift); + + wr->send_flags = 0; + + umrwr->target.virt_addr = virt_addr; + umrwr->length = len; + umrwr->access_flags = access_flags; + umrwr->pd = pd; +} + +static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, + struct ib_send_wr *wr, u32 key) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr->opcode = MLX5_IB_WR_UMR; + umrwr->mkey = key; +} + +static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length, + int access_flags, int *npages, + int *page_shift, int *ncont, int *order) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length, + access_flags, 0); + if (IS_ERR(umem)) { + mlx5_ib_err(dev, 
"umem get failed (%ld)\n", PTR_ERR(umem)); + return (void *)umem; + } + + mlx5_ib_cont_pages(umem, start, npages, page_shift, ncont, order); + if (!*npages) { + mlx5_ib_warn(dev, "avoid zero region\n"); + ib_umem_release(umem); + return ERR_PTR(-EINVAL); + } + + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", + *npages, *ncont, *order, *page_shift); + + return umem; +} + +static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); + + context->status = wc->status; + complete(&context->done); +} + +static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) +{ + context->cqe.done = mlx5_ib_umr_done; + context->status = -1; + init_completion(&context->done); +} + +static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, + u64 virt_addr, u64 len, int npages, + int page_shift, int order, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct mlx5_umr_wr umrwr = {}; + struct ib_send_wr *bad; struct mlx5_ib_mr *mr; + struct ib_sge sg; + int size; + __be64 *mr_pas; + dma_addr_t dma; + int err = 0; + int i; + + for (i = 0; i < 1; i++) { + mr = alloc_cached_mr(dev, order); + if (mr) + break; + + err = add_keys(dev, order2idx(dev, order), 1); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); + break; + } + } + + if (!mr) + return ERR_PTR(-EAGAIN); + + err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas, + &dma); + if (err) + goto free_mr; + + mlx5_ib_init_umr_context(&umr_context); + + umrwr.wr.wr_cqe = &umr_context.cqe; + prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, + page_shift, virt_addr, len, access_flags); + + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr.wr, &bad); + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + goto unmap_dma; + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed\n"); + err = -EFAULT; + } + } + + mr->mmkey.iova = virt_addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; + + mr->live = 1; + +unmap_dma: + up(&umrc->sem); + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + + kfree(mr_pas); + +free_mr: + if (err) { + free_cached_mr(dev, mr); + return ERR_PTR(err); + } + + return mr; +} + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, + int zap) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dma_device; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct ib_umem *umem = mr->umem; + int size; + __be64 *pas; + dma_addr_t dma; + struct ib_send_wr *bad; + struct mlx5_umr_wr wr; + struct ib_sge sg; + int err = 0; + const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); + const int page_index_mask = page_index_alignment - 1; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t pages_iter = 0; + int use_emergency_buf = 0; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly */ + if (start_page_index & page_index_mask) { + npages += start_page_index & page_index_mask; + start_page_index &= ~page_index_mask; + } + + pages_to_map = ALIGN(npages, 
page_index_alignment); + + if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) + return -EINVAL; + + size = sizeof(u64) * pages_to_map; + size = min_t(int, PAGE_SIZE, size); + /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim + * code, when we are called from an invalidation. The pas buffer must + * be 2k-aligned for Connect-IB. */ + pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); + if (!pas) { + mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); + pas = mlx5_ib_update_mtt_emergency_buffer; + size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; + use_emergency_buf = 1; + mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + memset(pas, 0, size); + } + pages_iter = size / sizeof(u64); + dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); + err = -ENOMEM; + goto free_pas; + } + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, start_page_index += pages_iter) { + dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); + + npages = min_t(size_t, + pages_iter, + ib_umem_num_pages(umem) - start_page_index); + + if (!zap) { + __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, + start_page_index, npages, pas, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages brought from the + * umem. */ + memset(pas + npages, 0, size - npages * sizeof(u64)); + } + + dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); + + mlx5_ib_init_umr_context(&umr_context); + + memset(&wr, 0, sizeof(wr)); + wr.wr.wr_cqe = &umr_context.cqe; + + sg.addr = dma; + sg.length = ALIGN(npages * sizeof(u64), + MLX5_UMR_MTT_ALIGNMENT); + sg.lkey = dev->umrc.pd->local_dma_lkey; + + wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | + MLX5_IB_SEND_UMR_UPDATE_MTT; + wr.wr.sg_list = &sg; + wr.wr.num_sge = 1; + wr.wr.opcode = MLX5_IB_WR_UMR; + wr.npages = sg.length / sizeof(u64); + wr.page_shift = PAGE_SHIFT; + wr.mkey = mr->mmkey.key; + wr.target.offset = start_page_index; + + down(&umrc->sem); + err = ib_post_send(umrc->qp, &wr.wr, &bad); + if (err) { + mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_err(dev, "UMR completion failed, code %d\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + } + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + +free_pas: + if (!use_emergency_buf) + free_page((unsigned long)pas); + else + mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + + return err; +} +#endif + +/* + * If ibmr is NULL it will be allocated by reg_create. + * Else, the given ibmr will be used. + */ +static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, + u64 virt_addr, u64 length, + struct ib_umem *umem, int npages, + int page_shift, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + __be64 *pas; + void *mkc; int inlen; + u32 *in; int err; bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = ibmr ? 
to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2; + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(*pas) * ((npages + 1) / 2) * 2; in = mlx5_vzalloc(inlen); if (!in) { err = -ENOMEM; goto err_1; } - mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, + pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); + mlx5_ib_populate_pas(dev, umem, page_shift, pas, pg_cap ? MLX5_IB_MTT_PRESENT : 0); - /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags + /* The pg_access bit allows setting the access flags * in the page list submitted with the command. */ - in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; - in->seg.flags = convert_access(access_flags) | - MLX5_ACCESS_MODE_MTT; - in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); - in->seg.start_addr = cpu_to_be64(virt_addr); - in->seg.len = cpu_to_be64(length); - in->seg.bsfs_octo_size = 0; - in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - in->seg.log2_page_size = page_shift; - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, - 1 << page_shift)); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, - NULL, NULL); + MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, mkc, lr, 1); + + MLX5_SET64(mkc, mkc, start_addr, virt_addr); + MLX5_SET64(mkc, mkc, len, length); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, bsf_octword_size, 0); + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(virt_addr, length, 1 << page_shift)); + MLX5_SET(mkc, mkc, log_page_size, page_shift); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, + get_octo_len(virt_addr, length, 1 << page_shift)); + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, + (struct mlx5_create_mkey_mbox_in *)in, + inlen, NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; } mr->umem = umem; mr->dev = dev; + mr->live = 1; kvfree(in); - mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); return mr; @@ -590,42 +942,26 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, kvfree(in); err_1: - kfree(mr); + if (!ibmr) + kfree(mr); return ERR_PTR(err); } -enum { - MLX5_MAX_REG_ORDER = MAX_MR_CACHE_ENTRIES + 1, - MLX5_MAX_REG_SIZE = 2ul * 1024 * 1024 * 1024, -}; - -static int clean_mr(struct mlx5_ib_mr *mr) +static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, + int npages, u64 length, int access_flags) { - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); - int umred = mr->umred; - int err; - int i; - - if (!umred) { - for (i = 0; i < mr->nchild; ++i) { - free_cached_mr(dev, mr->children[i]); - } - kfree(mr->children); - - err = destroy_mkey(dev, mr); - if (err) { - mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", - mr->mmr.key, err); - return err; - } - } - return 0; + mr->npages = npages; + atomic_add(npages, 
&dev->mdev->priv.reg_pages); + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + mr->ibmr.length = length; + mr->access_flags = access_flags; } struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, - struct ib_udata *udata, int mr_id) + struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; @@ -637,195 +973,303 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err; mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", - (unsigned long long)start, (unsigned long long)virt_addr, - (unsigned long long)length, access_flags); - umem = ib_umem_get(pd->uobject->context, start, length, access_flags, 0); - if (IS_ERR(umem)) { - mlx5_ib_warn(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); - return (void *)umem; - } + (long long)start, (long long)virt_addr, (long long)length, access_flags); + umem = mr_umem_get(pd, start, length, access_flags, &npages, + &page_shift, &ncont, &order); - mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order); - if (!npages) { - mlx5_ib_warn(dev, "avoid zero region\n"); + if (IS_ERR(umem)) + return (void *)umem; + + if (use_umr(order)) { + mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, + order, access_flags); + if (PTR_ERR(mr) == -EAGAIN) { + mlx5_ib_dbg(dev, "cache empty for order %d", order); + mr = NULL; + } + } else if (access_flags & IB_ACCESS_ON_DEMAND) { err = -EINVAL; + pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); goto error; } - mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", - npages, ncont, order, page_shift); - - mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift, access_flags); - mutex_unlock(&dev->slow_path_mutex); + if (!mr) { + mutex_lock(&dev->slow_path_mutex); + mr = reg_create(NULL, pd, virt_addr, length, umem, ncont, + page_shift, access_flags); + mutex_unlock(&dev->slow_path_mutex); + } if (IS_ERR(mr)) { err = PTR_ERR(mr); - mr = NULL; goto error; } - mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key); + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); mr->umem = umem; - mr->npages = npages; - atomic_add(npages, &dev->mdev->priv.reg_pages); - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + set_mr_fileds(dev, mr, npages, length, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif return &mr->ibmr; error: - /* - * Destroy the umem *before* destroying the MR, to ensure we - * will not have any in-flight notifiers when destroying the - * MR. - * - * As the MR is completely invalid to begin with, and this - * error path is only taken if we can't push the mr entry into - * the pagefault tree, this is safe. 
- */ - ib_umem_release(umem); return ERR_PTR(err); } -CTASSERT(sizeof(((struct ib_phys_buf *)0)->size) == 8); +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +{ + struct mlx5_core_dev *mdev = dev->mdev; + struct umr_common *umrc = &dev->umrc; + struct mlx5_ib_umr_context umr_context; + struct mlx5_umr_wr umrwr = {}; + struct ib_send_wr *bad; + int err; -struct ib_mr * -mlx5_ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int access_flags, - u64 *virt_addr) + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + + mlx5_ib_init_umr_context(&umr_context); + + umrwr.wr.wr_cqe = &umr_context.cqe; + prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); + + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr.wr, &bad); + if (err) { + up(&umrc->sem); + mlx5_ib_dbg(dev, "err %d\n", err); + goto error; + } else { + wait_for_completion(&umr_context.done); + up(&umrc->sem); + } + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "unreg umr failed\n"); + err = -EFAULT; + goto error; + } + return 0; + +error: + return err; +} + +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, + u64 length, int npages, int page_shift, int order, + int access_flags, int flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_create_mkey_mbox_in *in; - struct mlx5_ib_mr *mr; - u64 total_size; - u32 octo_len; - bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); - unsigned long mask; - int shift; - int npages; - int inlen; + struct device *ddev = dev->ib_dev.dma_device; + struct mlx5_ib_umr_context umr_context; + struct ib_send_wr *bad; + struct mlx5_umr_wr umrwr = {}; + struct ib_sge sg; + struct umr_common *umrc = &dev->umrc; + dma_addr_t dma = 0; + __be64 *mr_pas = NULL; + int size; int err; - int i, j, n; - mask = buffer_list[0].addr ^ *virt_addr; - total_size = 0; - for (i = 0; i < num_phys_buf; ++i) { - if (i != 0) - mask |= buffer_list[i].addr; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; + mlx5_ib_init_umr_context(&umr_context); - total_size += buffer_list[i].size; + umrwr.wr.wr_cqe = &umr_context.cqe; + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; + + if (flags & IB_MR_REREG_TRANS) { + err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size, + &mr_pas, &dma); + if (err) + return err; + + umrwr.target.virt_addr = virt_addr; + umrwr.length = length; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; } - if (mask & ~PAGE_MASK) - return ERR_PTR(-EINVAL); + prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, + page_shift); - shift = __ffs(mask | 1 << 31); - - buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); - buffer_list[0].addr &= ~0ULL << shift; - - npages = 0; - for (i = 0; i < num_phys_buf; ++i) - npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift; - - if (!npages) { - mlx5_ib_warn(dev, "avoid zero region\n"); - return ERR_PTR(-EINVAL); + if (flags & IB_MR_REREG_PD) { + umrwr.pd = pd; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD; } - mr = kzalloc(sizeof *mr, GFP_KERNEL); - if (!mr) - return ERR_PTR(-ENOMEM); - - octo_len = get_octo_len(*virt_addr, total_size, 1ULL << shift); - octo_len = ALIGN(octo_len, 4); - - inlen = sizeof(*in) + (octo_len * 16); - in = mlx5_vzalloc(inlen); - if (!in) { - kfree(mr); - return ERR_PTR(-ENOMEM); + if (flags & IB_MR_REREG_ACCESS) { + umrwr.access_flags = access_flags; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; } - n = 0; - for (i 
= 0; i < num_phys_buf; ++i) { - for (j = 0; - j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift; - ++j) { - u64 temp = buffer_list[i].addr + ((u64) j << shift); - if (pg_cap) - temp |= MLX5_IB_MTT_PRESENT; - in->pas[n++] = cpu_to_be64(temp); + /* post send request to UMR QP */ + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr.wr, &bad); + + if (err) { + mlx5_ib_warn(dev, "post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; } } - /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags - * in the page list submitted with the command. */ - in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; - in->seg.flags = convert_access(access_flags) | - MLX5_ACCESS_MODE_MTT; - in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); - in->seg.start_addr = cpu_to_be64(*virt_addr); - in->seg.len = cpu_to_be64(total_size); - in->seg.bsfs_octo_size = 0; - in->seg.xlt_oct_size = cpu_to_be32(octo_len); - in->seg.log2_page_size = shift; - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->xlat_oct_act_size = cpu_to_be32(octo_len); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL, - NULL, NULL); - mr->umem = NULL; - mr->dev = dev; - mr->npages = npages; - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; - - kvfree(in); - - if (err) { - kfree(mr); - return ERR_PTR(err); + up(&umrc->sem); + if (flags & IB_MR_REREG_TRANS) { + dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + kfree(mr_pas); } - return &mr->ibmr; + return err; } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, + u64 length, u64 virt_addr, int new_access_flags, + struct ib_pd *new_pd, struct ib_udata *udata) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); - struct ib_umem *umem = mr->umem; - int npages = mr->npages; - int umred = mr->umred; + struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); + struct mlx5_ib_mr *mr = to_mmr(ib_mr); + struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd; + int access_flags = flags & IB_MR_REREG_ACCESS ? + new_access_flags : + mr->access_flags; + u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address; + u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length; + int page_shift = 0; + int npages = 0; + int ncont = 0; + int order = 0; int err; - err = clean_mr(mr); - if (err) - return err; + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", + (long long)start, (long long)virt_addr, (long long)length, access_flags); - if (umem) { - ib_umem_release(umem); - atomic_sub(npages, &dev->mdev->priv.reg_pages); + if (flags != IB_MR_REREG_PD) { + /* + * Replace umem. This needs to be done whether or not UMR is + * used. + */ + flags |= IB_MR_REREG_TRANS; + ib_umem_release(mr->umem); + mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages, + &page_shift, &ncont, &order); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + mr->umem = NULL; + return err; + } } - if (umred) - free_cached_mr(dev, mr); - else - kfree(mr); + if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) { + /* + * UMR can't be used - MKey needs to be replaced. 
+ */ + if (mr->umred) { + err = unreg_umr(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to unregister MR\n"); + } else { + err = destroy_mkey(dev, mr); + if (err) + mlx5_ib_warn(dev, "Failed to destroy MKey\n"); + } + if (err) + return err; + + mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont, + page_shift, access_flags); + + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->umred = 0; + } else { + /* + * Send a UMR WQE + */ + err = rereg_umr(pd, mr, addr, len, npages, page_shift, + order, access_flags, flags); + if (err) { + mlx5_ib_warn(dev, "Failed to rereg UMR\n"); + return err; + } + } + + if (flags & IB_MR_REREG_PD) { + ib_mr->pd = pd; + mr->mmkey.pd = to_mpd(pd)->pdn; + } + + if (flags & IB_MR_REREG_ACCESS) + mr->access_flags = access_flags; + + if (flags & IB_MR_REREG_TRANS) { + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + set_mr_fileds(dev, mr, npages, len, access_flags); + mr->mmkey.iova = addr; + mr->mmkey.size = len; + } +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + update_odp_mr(mr); +#endif return 0; } -int mlx5_ib_destroy_mr(struct ib_mr *ibmr) +static int +mlx5_alloc_priv_descs(struct ib_device *device, + struct mlx5_ib_mr *mr, + int ndescs, + int desc_size) { - struct mlx5_ib_dev *dev = to_mdev(ibmr->device); - struct mlx5_ib_mr *mr = to_mmr(ibmr); + int size = ndescs * desc_size; + int add_size; + int ret; + + add_size = max_t(int, MLX5_UMR_ALIGN - 1, 0); + + mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); + if (!mr->descs_alloc) + return -ENOMEM; + + mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); + + mr->desc_map = dma_map_single(device->dma_device, mr->descs, + size, DMA_TO_DEVICE); + if (dma_mapping_error(device->dma_device, mr->desc_map)) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + kfree(mr->descs_alloc); + + return ret; +} + +static void +mlx5_free_priv_descs(struct mlx5_ib_mr *mr) +{ + if (mr->descs) { + struct ib_device *device = mr->ibmr.device; + int size = mr->max_descs * mr->desc_size; + + dma_unmap_single(device->dma_device, mr->desc_map, + size, DMA_TO_DEVICE); + kfree(mr->descs_alloc); + mr->descs = NULL; + } +} + +static int clean_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + int umred = mr->umred; int err; if (mr->sig) { @@ -838,473 +1282,383 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr) mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", mr->sig->psv_wire.psv_idx); kfree(mr->sig); + mr->sig = NULL; } - err = destroy_mkey(dev, mr); - if (err) { - mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", - mr->mmr.key, err); - return err; + mlx5_free_priv_descs(mr); + + if (!umred) { + err = destroy_mkey(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", + mr->mmkey.key, err); + return err; + } + } else { + err = unreg_umr(dev, mr); + if (err) { + mlx5_ib_warn(dev, "failed unregister\n"); + return err; + } + free_cached_mr(dev, mr); } - kfree(mr); + if (!umred) + kfree(mr); - return err; + return 0; } -struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, - int max_page_list_len) +int mlx5_ib_dereg_mr(struct ib_mr *ibmr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int npages = mr->npages; + struct ib_umem *umem = mr->umem; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (umem && umem->odp_data) { + /* Prevent new page faults from succeeding */ + mr->live = 0; + /* Wait for all running page-fault handlers to finish. 
*/ + synchronize_srcu(&dev->mr_srcu); + /* Destroy all page mappings */ + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + /* + * We kill the umem before the MR for ODP, + * so that there will not be any invalidations in + * flight, looking at the *mr struct. + */ + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + + /* Avoid double-freeing the umem. */ + umem = NULL; + } +#endif + + clean_mr(mr); + + if (umem) { + ib_umem_release(umem); + atomic_sub(npages, &dev->mdev->priv.reg_pages); + } + + return 0; +} + +struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_create_mkey_mbox_in *in; + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + int ndescs = ALIGN(max_num_sg, 4); struct mlx5_ib_mr *mr; + void *mkc; + u32 *in; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - in = kzalloc(sizeof(*in), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); if (!in) { err = -ENOMEM; goto err_free; } - in->seg.status = MLX5_MKEY_STATUS_FREE; - in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); - in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; - in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); - /* - * TBD not needed - issue 197292 */ - in->seg.log2_page_size = PAGE_SHIFT; + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); - err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL, - NULL, NULL); - kfree(in); - if (err) { - mlx5_ib_warn(dev, "failed create mkey\n"); - goto err_free; + if (mr_type == IB_MR_TYPE_MEM_REG) { + mr->access_mode = MLX5_ACCESS_MODE_MTT; + MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(u64)); + if (err) + goto err_free_in; + + mr->desc_size = sizeof(u64); + mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SG_GAPS) { + mr->access_mode = MLX5_ACCESS_MODE_KLM; + + err = mlx5_alloc_priv_descs(pd->device, mr, + ndescs, sizeof(struct mlx5_klm)); + if (err) + goto err_free_in; + mr->desc_size = sizeof(struct mlx5_klm); + mr->max_descs = ndescs; + } else if (mr_type == IB_MR_TYPE_SIGNATURE) { + u32 psv_index[2]; + + MLX5_SET(mkc, mkc, bsf_en, 1); + MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); + mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); + if (!mr->sig) { + err = -ENOMEM; + goto err_free_in; + } + + /* create mem & wire PSVs */ + err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, + 2, psv_index); + if (err) + goto err_free_sig; + + mr->access_mode = MLX5_ACCESS_MODE_KLM; + mr->sig->psv_memory.psv_idx = psv_index[0]; + mr->sig->psv_wire.psv_idx = psv_index[1]; + + mr->sig->sig_status_checked = true; + mr->sig->sig_err_exists = false; + /* Next UMR, Arm SIGERR */ + ++mr->sig->sigerr_count; + } else { + mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); + err = -EINVAL; + goto err_free_in; } - mr->ibmr.lkey = mr->mmr.key; - mr->ibmr.rkey = mr->mmr.key; + MLX5_SET(mkc, mkc, access_mode, mr->access_mode); + MLX5_SET(mkc, mkc, umr_en, 1); + + err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, + (struct mlx5_create_mkey_mbox_in *)in, + inlen, NULL, NULL, NULL); + if (err) + goto err_destroy_psv; + + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = 
mr->mmkey.key; mr->umem = NULL; + kfree(in); return &mr->ibmr; +err_destroy_psv: + if (mr->sig) { + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_memory.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", + mr->sig->psv_memory.psv_idx); + if (mlx5_core_destroy_psv(dev->mdev, + mr->sig->psv_wire.psv_idx)) + mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", + mr->sig->psv_wire.psv_idx); + } + mlx5_free_priv_descs(mr); +err_free_sig: + kfree(mr->sig); +err_free_in: + kfree(in); err_free: kfree(mr); return ERR_PTR(err); } -struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev, - int page_list_len) +struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, + struct ib_udata *udata) { - struct mlx5_ib_fast_reg_page_list *mfrpl; - int size = page_list_len * sizeof(u64); - - mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL); - if (!mfrpl) - return ERR_PTR(-ENOMEM); - - mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL); - if (!mfrpl->ibfrpl.page_list) - goto err_free; - - mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device, - size, &mfrpl->map, - GFP_KERNEL); - if (!mfrpl->mapped_page_list) - goto err_free; - - WARN_ON(mfrpl->map & 0x3f); - - return &mfrpl->ibfrpl; - -err_free: - kfree(mfrpl->ibfrpl.page_list); - kfree(mfrpl); - return ERR_PTR(-ENOMEM); -} - -void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) -{ - struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); - struct mlx5_ib_dev *dev = to_mdev(page_list->device); - int size = page_list->max_page_list_len * sizeof(u64); - - dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list, - mfrpl->map); - kfree(mfrpl->ibfrpl.page_list); - kfree(mfrpl); -} - -struct order_attribute { - struct attribute attr; - ssize_t (*show)(struct cache_order *, struct order_attribute *, char *buf); - ssize_t (*store)(struct cache_order *, struct order_attribute *, - const char *buf, size_t count); -}; - -static ssize_t cur_show(struct cache_order *co, struct order_attribute *oa, - char *buf) -{ - struct mlx5_ib_dev *dev = co->dev; - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[co->index]; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_ib_mw *mw = NULL; + u32 *in = NULL; + void *mkc; + int ndescs; int err; + struct mlx5_ib_alloc_mw req = {}; + struct { + __u32 comp_mask; + __u32 response_length; + } resp = {}; - err = snprintf(buf, 20, "%d\n", ent->cur); - return err; -} - -static ssize_t limit_show(struct cache_order *co, struct order_attribute *oa, - char *buf) -{ - struct mlx5_ib_dev *dev = co->dev; - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[co->index]; - int err; - - err = snprintf(buf, 20, "%d\n", ent->limit); - return err; -} - -static ssize_t limit_store(struct cache_order *co, struct order_attribute *oa, - const char *buf, size_t count) -{ - struct mlx5_ib_dev *dev = co->dev; - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[co->index]; - u32 var; - int err; - -#define kstrtouint(a,b,c) ({*(c) = strtol(a,0,b); 0;}) -#define kstrtoint(a,b,c) ({*(c) = strtol(a,0,b); 0;}) - - if (kstrtouint(buf, 0, &var)) - return -EINVAL; - - if (var > ent->size) - return -EINVAL; - - ent->limit = var; - - if (ent->cur < ent->limit) { - err = add_keys(dev, co->index, 2 * ent->limit - ent->cur); - if (err) - return err; - } - - return count; -} - -static ssize_t 
miss_show(struct cache_order *co, struct order_attribute *oa, - char *buf) -{ - struct mlx5_ib_dev *dev = co->dev; - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[co->index]; - int err; - - err = snprintf(buf, 20, "%d\n", ent->miss); - return err; -} - -static ssize_t miss_store(struct cache_order *co, struct order_attribute *oa, - const char *buf, size_t count) -{ - struct mlx5_ib_dev *dev = co->dev; - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[co->index]; - u32 var; - - if (kstrtouint(buf, 0, &var)) - return -EINVAL; - - if (var != 0) - return -EINVAL; - - ent->miss = var; - - return count; -} - -static ssize_t size_show(struct cache_order *co, struct order_attribute *oa, - char *buf) -{ - struct mlx5_ib_dev *dev = co->dev; - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[co->index]; - int err; - - err = snprintf(buf, 20, "%d\n", ent->size); - return err; -} - -static ssize_t size_store(struct cache_order *co, struct order_attribute *oa, - const char *buf, size_t count) -{ - struct mlx5_ib_dev *dev = co->dev; - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[co->index]; - u32 var; - int err; - - if (kstrtouint(buf, 0, &var)) - return -EINVAL; - - if (var < ent->limit) - return -EINVAL; - - if (var > ent->size) { - do { - err = add_keys(dev, co->index, var - ent->size); - if (err && err != -EAGAIN) - return err; - - usleep_range(3000, 5000); - } while (err); - } else if (var < ent->size) { - remove_keys(dev, co->index, ent->size - var); - } - - return count; -} - -static ssize_t order_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct order_attribute *oa = - container_of(attr, struct order_attribute, attr); - struct cache_order *co = container_of(kobj, struct cache_order, kobj); - - if (!oa->show) - return -EIO; - - return oa->show(co, oa, buf); -} - -static ssize_t order_attr_store(struct kobject *kobj, - struct attribute *attr, const char *buf, size_t size) -{ - struct order_attribute *oa = - container_of(attr, struct order_attribute, attr); - struct cache_order *co = container_of(kobj, struct cache_order, kobj); - - if (!oa->store) - return -EIO; - - return oa->store(co, oa, buf, size); -} - -static const struct sysfs_ops order_sysfs_ops = { - .show = order_attr_show, - .store = order_attr_store, -}; - -#define ORDER_ATTR(_name) struct order_attribute order_attr_##_name = \ - __ATTR(_name, 0644, _name##_show, _name##_store) -#define ORDER_ATTR_RO(_name) struct order_attribute order_attr_##_name = \ - __ATTR(_name, 0444, _name##_show, NULL) - -static ORDER_ATTR_RO(cur); -static ORDER_ATTR(limit); -static ORDER_ATTR(miss); -static ORDER_ATTR(size); - -static struct attribute *order_default_attrs[] = { - &order_attr_cur.attr, - &order_attr_limit.attr, - &order_attr_miss.attr, - &order_attr_size.attr, - NULL -}; - -static struct kobj_type order_type = { - .sysfs_ops = &order_sysfs_ops, - .default_attrs = order_default_attrs -}; - - - -struct cache_attribute { - struct attribute attr; - ssize_t (*show)(struct mlx5_ib_dev *dev, char *buf); - ssize_t (*store)(struct mlx5_ib_dev *dev, const char *buf, size_t count); -}; - -static ssize_t rel_imm_show(struct mlx5_ib_dev *dev, char *buf) -{ - struct mlx5_mr_cache *cache = &dev->cache; - int err; - - err = snprintf(buf, 20, "%d\n", cache->rel_imm); - return err; -} - -static ssize_t rel_imm_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) -{ - 
struct mlx5_mr_cache *cache = &dev->cache; - u32 var; - int i; - int found = 0; - - if (kstrtouint(buf, 0, &var)) - return -EINVAL; - - if (var > 1) - return -EINVAL; - - if (var == cache->rel_imm) - return count; - - cache->rel_imm = var; - if (cache->rel_imm == 1) { - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - if (cache->ent[i].cur > 2 * cache->ent[i].limit) { - queue_work(cache->wq, &cache->ent[i].work); - found = 1; - } - } - if (!found) - cache->rel_imm = 0; - } - - return count; -} -static ssize_t rel_timeout_show(struct mlx5_ib_dev *dev, char *buf) -{ - struct mlx5_mr_cache *cache = &dev->cache; - int err; - - err = snprintf(buf, 20, "%d\n", cache->rel_timeout); - return err; -} - -static ssize_t rel_timeout_store(struct mlx5_ib_dev *dev, const char *buf, size_t count) -{ - struct mlx5_mr_cache *cache = &dev->cache; - int var; - int i; - - if (kstrtoint(buf, 0, &var)) - return -EINVAL; - - if (var < -1 || var > MAX_MR_RELEASE_TIMEOUT) - return -EINVAL; - - if (var == cache->rel_timeout) - return count; - - if (cache->rel_timeout == -1 || (var < cache->rel_timeout && var != -1)) { - cache->rel_timeout = var; - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - if (cache->ent[i].cur > 2 * cache->ent[i].limit) - queue_work(cache->wq, &cache->ent[i].work); - } - } else { - cache->rel_timeout = var; - } - - return count; -} - -static ssize_t cache_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct cache_attribute *ca = - container_of(attr, struct cache_attribute, attr); - struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); - - if (!ca->show) - return -EIO; - - return ca->show(dev, buf); -} - -static ssize_t cache_attr_store(struct kobject *kobj, - struct attribute *attr, const char *buf, size_t size) -{ - struct cache_attribute *ca = - container_of(attr, struct cache_attribute, attr); - struct mlx5_ib_dev *dev = container_of(kobj, struct mlx5_ib_dev, mr_cache); - - if (!ca->store) - return -EIO; - - return ca->store(dev, buf, size); -} - -static const struct sysfs_ops cache_sysfs_ops = { - .show = cache_attr_show, - .store = cache_attr_store, -}; - -#define CACHE_ATTR(_name) struct cache_attribute cache_attr_##_name = \ - __ATTR(_name, 0644, _name##_show, _name##_store) - -static CACHE_ATTR(rel_imm); -static CACHE_ATTR(rel_timeout); - -static struct attribute *cache_default_attrs[] = { - &cache_attr_rel_imm.attr, - &cache_attr_rel_timeout.attr, - NULL -}; - -static struct kobj_type cache_type = { - .sysfs_ops = &cache_sysfs_ops, - .default_attrs = cache_default_attrs -}; - -static int mlx5_mr_sysfs_init(struct mlx5_ib_dev *dev) -{ - struct mlx5_mr_cache *cache = &dev->cache; - struct device *device = &dev->ib_dev.dev; - struct cache_order *co; - int o; - int i; - int err; - - err = kobject_init_and_add(&dev->mr_cache, &cache_type, - &device->kobj, "mr_cache"); + err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); if (err) + return ERR_PTR(err); + + if (req.comp_mask || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + udata->inlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + + ndescs = req.num_klms ? 
roundup(req.num_klms, 4) : roundup(1, 4); + + mw = kzalloc(sizeof(*mw), GFP_KERNEL); + in = kzalloc(inlen, GFP_KERNEL); + if (!mw || !in) { + err = -ENOMEM; + goto free; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, ndescs); + MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_KLM); + MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2))); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, + (struct mlx5_create_mkey_mbox_in *)in, + inlen, NULL, NULL, NULL); + if (err) + goto free; + + mw->ibmw.rkey = mw->mmkey.key; + + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); + if (resp.response_length) { + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); + goto free; + } + } + + kfree(in); + return &mw->ibmw; + +free: + kfree(mw); + kfree(in); + return ERR_PTR(err); +} + +int mlx5_ib_dealloc_mw(struct ib_mw *mw) +{ + struct mlx5_ib_mw *mmw = to_mmw(mw); + int err; + + err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev, + &mmw->mmkey); + if (!err) + kfree(mmw); + return err; +} + +int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, + struct ib_mr_status *mr_status) +{ + struct mlx5_ib_mr *mmr = to_mmr(ibmr); + int ret = 0; + + if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { + pr_err("Invalid status check mask\n"); + ret = -EINVAL; + goto done; + } + + mr_status->fail_status = 0; + if (check_mask & IB_MR_CHECK_SIG_STATUS) { + if (!mmr->sig) { + ret = -EINVAL; + pr_err("signature status check requested on a non-signature enabled MR\n"); + goto done; + } + + mmr->sig->sig_status_checked = true; + if (!mmr->sig->sig_err_exists) + goto done; + + if (ibmr->lkey == mmr->sig->err_item.key) + memcpy(&mr_status->sig_err, &mmr->sig->err_item, + sizeof(mr_status->sig_err)); + else { + mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; + mr_status->sig_err.sig_err_offset = 0; + mr_status->sig_err.key = mmr->sig->err_item.key; + } + + mmr->sig->sig_err_exists = false; + mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; + } + +done: + return ret; +} + +static int +mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, + struct scatterlist *sgl, + unsigned short sg_nents, + unsigned int *sg_offset_p) +{ + struct scatterlist *sg = sgl; + struct mlx5_klm *klms = mr->descs; + unsigned int sg_offset = sg_offset_p ? 
*sg_offset_p : 0; + u32 lkey = mr->ibmr.pd->local_dma_lkey; + int i; + + mr->ibmr.iova = sg_dma_address(sg) + sg_offset; + mr->ibmr.length = 0; + mr->ndescs = sg_nents; + + for_each_sg(sgl, sg, sg_nents, i) { + if (unlikely(i > mr->max_descs)) + break; + klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); + klms[i].key = cpu_to_be32(lkey); + mr->ibmr.length += sg_dma_len(sg); + + sg_offset = 0; + } + + if (sg_offset_p) + *sg_offset_p = sg_offset; + + return i; +} + +static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + __be64 *descs; + + if (unlikely(mr->ndescs == mr->max_descs)) return -ENOMEM; - for (o = 2, i = 0; i < MAX_MR_CACHE_ENTRIES; o++, i++) { - co = &cache->ent[i].co; - co->order = o; - co->index = i; - co->dev = dev; - err = kobject_init_and_add(&co->kobj, &order_type, - &dev->mr_cache, "%d", o); - if (err) - goto err_put; - } + descs = mr->descs; + descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR); return 0; - -err_put: - for (; i >= 0; i--) { - co = &cache->ent[i].co; - kobject_put(&co->kobj); - } - kobject_put(&dev->mr_cache); - - return err; } -static void mlx5_mr_sysfs_cleanup(struct mlx5_ib_dev *dev) +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { - struct mlx5_mr_cache *cache = &dev->cache; - struct cache_order *co; - int i; + struct mlx5_ib_mr *mr = to_mmr(ibmr); + int n; - for (i = MAX_MR_CACHE_ENTRIES - 1; i >= 0; i--) { - co = &cache->ent[i].co; - kobject_put(&co->kobj); - } - kobject_put(&dev->mr_cache); + mr->ndescs = 0; + + ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); + else + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + mlx5_set_page); + + ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, + mr->desc_size * mr->max_descs, + DMA_TO_DEVICE); + + return n; } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c index 6943a2c87c34..a651a0856827 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c @@ -26,22 +26,13 @@ */ #include -#include #include +#include +#include #include "mlx5_ib.h" -#include "user.h" -#include -#include - -#define IPV6_DEFAULT_HOPLIMIT 64 - - -static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, int attr_mask, - enum ib_qp_state cur_state, enum ib_qp_state new_state); /* not supported currently */ -static int workqueue_signature; +static int wq_signature; enum { MLX5_IB_ACK_REQ_FREQ = 8, @@ -56,11 +47,11 @@ enum { enum { MLX5_IB_SQ_STRIDE = 6, - MLX5_IB_CACHE_LINE_SIZE = 64, }; static const u32 mlx5_ib_opcode[] = { [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, @@ -69,31 +60,36 @@ static const u32 mlx5_ib_opcode[] = { [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, - [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_REG_MR] = MLX5_OPCODE_UMR, [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, }; -struct umr_wr { - u64 
virt_addr; - struct ib_pd *pd; - unsigned int page_shift; - unsigned int npages; - u32 length; - int access_flags; - u32 mkey; +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; }; +enum raw_qp_set_mask_map { + MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID = 1UL << 0, +}; + +struct mlx5_modify_raw_qp_param { + u16 operation; + + u32 set_mask; /* raw_qp_set_mask_map */ + u8 rq_q_ctr_id; +}; + +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq); + static int is_qp0(enum ib_qp_type qp_type) { return qp_type == IB_QPT_SMI; } -static int is_qp1(enum ib_qp_type qp_type) -{ - return qp_type == IB_QPT_GSI; -} - static int is_sqp(enum ib_qp_type qp_type) { return is_qp0(qp_type) || is_qp1(qp_type); @@ -114,90 +110,76 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); } - -static int -query_wqe_idx(struct mlx5_ib_qp *qp) +/** + * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. + * + * @qp: QP to copy from. + * @send: copy from the send queue when non-zero, use the receive queue + * otherwise. + * @wqe_index: index to start copying from. For send work queues, the + * wqe_index is in units of MLX5_SEND_WQE_BB. + * For receive work queue, it is the number of work queue + * element in the queue. + * @buffer: destination buffer. + * @length: maximum number of bytes to copy. + * + * Copies at least a single WQE, but may copy more data. + * + * Return: the number of bytes copied, or an error code. + */ +int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, + void *buffer, u32 length, + struct mlx5_ib_qp_base *base) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); - struct mlx5_query_qp_mbox_out *outb; - struct mlx5_qp_context *context; + struct ib_device *ibdev = qp->ibqp.device; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; + size_t offset; + size_t wq_end; + struct ib_umem *umem = base->ubuffer.umem; + u32 first_copy_length; + int wqe_length; int ret; - outb = kzalloc(sizeof(*outb), GFP_KERNEL); - if (!outb) - return -ENOMEM; + if (wq->wqe_cnt == 0) { + mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. 
qp_type: 0x%x\n", + qp->ibqp.qp_type); + return -EINVAL; + } - context = &outb->ctx; + offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); + wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); - mutex_lock(&qp->mutex); - ret = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) + return -EINVAL; + + if (offset > umem->length || + (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) + return -EINVAL; + + first_copy_length = min_t(u32, offset + length, wq_end) - offset; + ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); if (ret) - goto out_free; + return ret; - ret = be16_to_cpu(context->hw_sq_wqe_counter) & (qp->sq.wqe_cnt - 1); + if (send) { + struct mlx5_wqe_ctrl_seg *ctrl = buffer; + int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; -out_free: - mutex_unlock(&qp->mutex); - kfree(outb); - - return ret; -} - -static int mlx5_handle_sig_pipelining(struct mlx5_ib_qp *qp) -{ - int wqe_idx; - - wqe_idx = query_wqe_idx(qp); - if (wqe_idx < 0) { - printf("mlx5_ib: ERR: ""Failed to query QP 0x%x wqe index\n", qp->mqp.qpn); - return wqe_idx; + wqe_length = ds * MLX5_WQE_DS_UNITS; + } else { + wqe_length = 1 << wq->wqe_shift; } - if (qp->sq.swr_ctx[wqe_idx].sig_piped) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); - struct mlx5_wqe_ctrl_seg *cwqe; + if (wqe_length <= first_copy_length) + return first_copy_length; - cwqe = mlx5_get_send_wqe(qp, wqe_idx); - cwqe->opmod_idx_opcode = cpu_to_be32(be32_to_cpu(cwqe->opmod_idx_opcode) & 0xffffff00); - qp->sq.swr_ctx[wqe_idx].w_list.opcode |= MLX5_OPCODE_SIGNATURE_CANCELED; - mlx5_ib_dbg(dev, "Cancel QP 0x%x wqe_index 0x%x\n", - qp->mqp.qpn, wqe_idx); - } + ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, + wqe_length - first_copy_length); + if (ret) + return ret; - return 0; -} - -static void mlx5_ib_sqd_work(struct work_struct *work) -{ - struct mlx5_ib_sqd *sqd; - struct mlx5_ib_qp *qp; - struct ib_qp_attr qp_attr; - - sqd = container_of(work, struct mlx5_ib_sqd, work); - qp = sqd->qp; - - if (mlx5_handle_sig_pipelining(qp)) - goto out; - - mutex_lock(&qp->mutex); - if (__mlx5_ib_modify_qp(&qp->ibqp, &qp_attr, 0, IB_QPS_SQD, IB_QPS_RTS)) - printf("mlx5_ib: ERR: ""Failed to resume QP 0x%x\n", qp->mqp.qpn); - mutex_unlock(&qp->mutex); -out: - kfree(sqd); -} - -static void mlx5_ib_sigerr_sqd_event(struct mlx5_ib_qp *qp) -{ - struct mlx5_ib_sqd *sqd; - - sqd = kzalloc(sizeof(*sqd), GFP_ATOMIC); - if (!sqd) - return; - - sqd->qp = qp; - INIT_WORK(&sqd->work, mlx5_ib_sqd_work); - queue_work(mlx5_ib_wq, &sqd->work); + return wqe_length; } static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) @@ -205,15 +187,11 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; struct ib_event event; - if (type == MLX5_EVENT_TYPE_SQ_DRAINED && - to_mibqp(qp)->state != IB_QPS_SQD) { - mlx5_ib_sigerr_sqd_event(to_mibqp(qp)); - return; + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; } - if (type == MLX5_EVENT_TYPE_PATH_MIG) - to_mibqp(qp)->port = to_mibqp(qp)->alt_port; - if (ibqp->event_handler) { event.device = ibqp->device; event.element.qp = ibqp; @@ -243,7 +221,7 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) event.event = IB_EVENT_QP_ACCESS_ERR; break; default: - printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on QP %06x\n", 
type, qp->qpn); + pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn); return; } @@ -296,20 +274,20 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, return 0; } -static int sq_overhead(enum ib_qp_type qp_type) +static int sq_overhead(struct ib_qp_init_attr *attr) { int size = 0; - switch (qp_type) { + switch (attr->qp_type) { case IB_QPT_XRC_INI: size += sizeof(struct mlx5_wqe_xrc_seg); /* fall through */ case IB_QPT_RC: size += sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_atomic_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_umr_ctrl_seg) + - sizeof(struct mlx5_mkey_seg); + max(sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); break; case IB_QPT_XRC_TGT: @@ -317,18 +295,28 @@ static int sq_overhead(enum ib_qp_type qp_type) case IB_QPT_UC: size += sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_umr_ctrl_seg) + - sizeof(struct mlx5_mkey_seg); + max(sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); break; case IB_QPT_UD: + if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + size += sizeof(struct mlx5_wqe_eth_pad) + + sizeof(struct mlx5_wqe_eth_seg); + /* fall through */ case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX5_IB_QPT_HW_GSI: size += sizeof(struct mlx5_wqe_ctrl_seg) + sizeof(struct mlx5_wqe_datagram_seg); break; + case MLX5_IB_QPT_REG_UMR: + size += sizeof(struct mlx5_wqe_ctrl_seg) + + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg); + break; + default: return -EINVAL; } @@ -341,7 +329,7 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) int inl_size = 0; int size; - size = sq_overhead(attr->qp_type); + size = sq_overhead(attr); if (size < 0) return size; @@ -351,30 +339,11 @@ static int calc_send_wqe(struct ib_qp_init_attr *attr) } size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg); - return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); -} - -static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size) -{ - int max_sge; - - if (attr->qp_type == IB_QPT_RC) - max_sge = (min_t(int, wqe_size, 512) - - sizeof(struct mlx5_wqe_ctrl_seg) - - sizeof(struct mlx5_wqe_raddr_seg)) / - sizeof(struct mlx5_wqe_data_seg); - else if (attr->qp_type == IB_QPT_XRC_INI) - max_sge = (min_t(int, wqe_size, 512) - - sizeof(struct mlx5_wqe_ctrl_seg) - - sizeof(struct mlx5_wqe_xrc_seg) - - sizeof(struct mlx5_wqe_raddr_seg)) / - sizeof(struct mlx5_wqe_data_seg); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN && + ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE) + return MLX5_SIG_WQE_SIZE; else - max_sge = (wqe_size - sq_overhead(attr->qp_type)) / - sizeof(struct mlx5_wqe_data_seg); - - return min_t(int, max_sge, wqe_size - sq_overhead(attr->qp_type) / - sizeof(struct mlx5_wqe_data_seg)); + return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB); } static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, @@ -392,32 +361,28 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, return wqe_size; if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) { - mlx5_ib_warn(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", - wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); + mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n", + wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)); return -EINVAL; } - 
qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) - - sizeof(struct mlx5_wqe_inline_seg); + qp->max_inline_data = wqe_size - sq_overhead(attr) - + sizeof(struct mlx5_wqe_inline_seg); attr->cap.max_inline_data = qp->max_inline_data; - wq_size = roundup_pow_of_two(attr->cap.max_send_wr * (u64)wqe_size); + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + qp->signature_en = true; + + wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size); qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB; if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) { - mlx5_ib_warn(dev, "wqe count(%d) exceeds limits(%d)\n", - qp->sq.wqe_cnt, - 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); + mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n", + qp->sq.wqe_cnt, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)); return -ENOMEM; } qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); - qp->sq.max_gs = get_send_sge(attr, wqe_size); - if (qp->sq.max_gs < attr->cap.max_send_sge) { - mlx5_ib_warn(dev, "max sge(%d) exceeds limits(%d)\n", - qp->sq.max_gs, attr->cap.max_send_sge); - return -ENOMEM; - } - - attr->cap.max_send_sge = qp->sq.max_gs; + qp->sq.max_gs = attr->cap.max_send_sge; qp->sq.max_post = wq_size / wqe_size; attr->cap.max_send_wr = qp->sq.max_post; @@ -427,6 +392,7 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd, + struct mlx5_ib_qp_base *base, struct ib_qp_init_attr *attr) { int desc_sz = 1 << qp->sq.wqe_shift; @@ -452,14 +418,12 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, return -EINVAL; } - if (attr->qp_type == IB_QPT_RAW_PACKET) { - qp->buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; - qp->sq_buf_size = qp->sq.wqe_cnt << 6; + base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; } else { - qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + - (qp->sq.wqe_cnt << 6); - qp->sq_buf_size = 0; + base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); } return 0; @@ -469,6 +433,7 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) { if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT || attr->srq || + attr->qp_type == MLX5_IB_QPT_REG_UMR || !attr->cap.max_recv_wr) return 0; @@ -547,7 +512,6 @@ static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) } uuari->count[minidx]++; - return minidx; } @@ -640,10 +604,11 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_RC: return MLX5_QP_ST_RC; case IB_QPT_UC: return MLX5_QP_ST_UC; case IB_QPT_UD: return MLX5_QP_ST_UD; + case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR; case IB_QPT_XRC_INI: case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; - case IB_QPT_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; @@ -662,48 +627,140 @@ static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; } +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, + struct ib_pd *pd, + unsigned long addr, size_t size, + struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + u32 *offset) +{ + int err; + + *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0); + if (IS_ERR(*umem)) { + mlx5_ib_dbg(dev, 
"umem_get failed\n"); + return PTR_ERR(*umem); + } + + mlx5_ib_cont_pages(*umem, addr, npages, page_shift, ncont, NULL); + + err = mlx5_ib_get_buf_offset(addr, *page_shift, offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n", + addr, size, *npages, *page_shift, *ncont, *offset); + + return 0; + +err_umem: + ib_umem_release(*umem); + *umem = NULL; + + return err; +} + +static void destroy_user_rq(struct ib_pd *pd, struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_ucontext *context; + + context = to_mucontext(pd->uobject->context); + mlx5_ib_db_unmap_user(context, &rwq->db); + if (rwq->umem) + ib_umem_release(rwq->umem); +} + +static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_rwq *rwq, + struct mlx5_ib_create_wq *ucmd) +{ + struct mlx5_ib_ucontext *context; + int page_shift = 0; + int npages; + u32 offset = 0; + int ncont = 0; + int err; + + if (!ucmd->buf_addr) + return -EINVAL; + + context = to_mucontext(pd->uobject->context); + rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, + rwq->buf_size, 0, 0); + if (IS_ERR(rwq->umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + err = PTR_ERR(rwq->umem); + return err; + } + + mlx5_ib_cont_pages(rwq->umem, ucmd->buf_addr, &npages, &page_shift, + &ncont, NULL); + err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, + &rwq->rq_page_offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + rwq->rq_num_pas = ncont; + rwq->page_shift = page_shift; + rwq->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + rwq->wq_sig = !!(ucmd->flags & MLX5_WQ_FLAG_SIGNATURE); + + mlx5_ib_dbg(dev, "addr 0x%llx, size %zd, npages %d, page_shift %d, ncont %d, offset %d\n", + (unsigned long long)ucmd->buf_addr, rwq->buf_size, + npages, page_shift, ncont, offset); + + err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db); + if (err) { + mlx5_ib_dbg(dev, "map failed\n"); + goto err_umem; + } + + rwq->create_type = MLX5_WQ_USER; + return 0; + +err_umem: + ib_umem_release(rwq->umem); + return err; +} + static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, struct ib_qp_init_attr *attr, - struct mlx5_create_qp_mbox_in **in, - int *inlen, - struct mlx5_exp_ib_create_qp *ucmd) + u32 **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base) { - struct mlx5_exp_ib_create_qp_resp resp; struct mlx5_ib_ucontext *context; + struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; int uar_index; int npages; u32 offset = 0; int uuarn; int ncont = 0; + __be64 *pas; + void *qpc; int err; + err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return err; + } + context = to_mucontext(pd->uobject->context); - memset(&resp, 0, sizeof(resp)); - resp.size_of_prefix = offsetof(struct mlx5_exp_ib_create_qp_resp, prefix_reserved); /* * TBD: should come from the verbs when we have the API */ - if (ucmd->exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX) { - if (ucmd->exp.wc_uar_index == MLX5_EXP_CREATE_QP_DB_ONLY_UUAR) { - /* Assign LATENCY_CLASS_LOW (DB only UUAR) to this QP */ - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); - if (uuarn < 0) { - mlx5_ib_warn(dev, "DB only uuar allocation failed\n"); - return uuarn; - } - uar_index = uuarn_to_uar_index(&context->uuari, uuarn); - } else if 
(ucmd->exp.wc_uar_index >= MLX5_IB_MAX_CTX_DYNAMIC_UARS || - context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index] == - MLX5_IB_INVALID_UAR_INDEX) { - mlx5_ib_warn(dev, "dynamic uuar allocation failed\n"); - return -EINVAL; - } else { - uar_index = context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index]; - uuarn = MLX5_EXP_INVALID_UUAR; - } - } else { + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + /* In CROSS_CHANNEL CQ and QP must use the same UAR */ + uuarn = MLX5_CROSS_CHANNEL_UUAR; + else { uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); if (uuarn < 0) { mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); @@ -719,68 +776,61 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } } } - uar_index = uuarn_to_uar_index(&context->uuari, uuarn); } + + uar_index = uuarn_to_uar_index(&context->uuari, uuarn); mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); qp->rq.offset = 0; qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - err = set_user_buf_size(dev, qp, (struct mlx5_ib_create_qp *)ucmd, attr); + err = set_user_buf_size(dev, qp, &ucmd, base, attr); if (err) goto err_uuar; - if (ucmd->buf_addr && qp->buf_size) { - qp->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr, - qp->buf_size, 0, 0); - if (IS_ERR(qp->umem)) { - mlx5_ib_warn(dev, "umem_get failed\n"); - err = PTR_ERR(qp->umem); + if (ucmd.buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd.buf_addr; + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, + ubuffer->buf_size, + &ubuffer->umem, &npages, &page_shift, + &ncont, &offset); + if (err) goto err_uuar; - } } else { - qp->umem = NULL; + ubuffer->umem = NULL; } - if (qp->umem) { - mlx5_ib_cont_pages(qp->umem, ucmd->buf_addr, &npages, &page_shift, - &ncont, NULL); - err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, &offset); - if (err) { - mlx5_ib_warn(dev, "bad offset\n"); - goto err_umem; - } - mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", - (unsigned long long)ucmd->buf_addr, qp->buf_size, - npages, page_shift, ncont, offset); - } - - *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * ncont; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_umem; } - if (qp->umem) - mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); - (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); - (*in)->ctx.params2 = cpu_to_be32(offset << 6); - (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - resp.uuar_index = uuarn; + pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); + if (ubuffer->umem) + mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0); + + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + + MLX5_SET(qpc, qpc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, page_offset, offset); + + MLX5_SET(qpc, qpc, uar_page, uar_index); + resp->uuar_index = uuarn; qp->uuarn = uuarn; - err = mlx5_ib_db_map_user(context, ucmd->db_addr, &qp->db); + err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); if (err) { - mlx5_ib_warn(dev, "map failed\n"); + mlx5_ib_dbg(dev, "map failed\n"); goto err_free; } - err = ib_copy_to_udata(udata, &resp, sizeof(struct mlx5_ib_create_qp_resp)); + err = ib_copy_to_udata(udata, resp, sizeof(*resp)); if (err) { - mlx5_ib_err(dev, "copy failed\n"); + mlx5_ib_dbg(dev, "copy failed\n"); goto err_unmap; } 
qp->create_type = MLX5_QP_USER; @@ -794,50 +844,52 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, kvfree(*in); err_umem: - if (qp->umem) - ib_umem_release(qp->umem); + if (ubuffer->umem) + ib_umem_release(ubuffer->umem); err_uuar: free_uuar(&context->uuari, uuarn); return err; } -static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, + struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &qp->db); - if (qp->umem) - ib_umem_release(qp->umem); - if (qp->sq_umem) - ib_umem_release(qp->sq_umem); - /* - * Free only the UUARs handled by the kernel. - * UUARs of UARs allocated dynamically are handled by user. - */ - if (qp->uuarn != MLX5_EXP_INVALID_UUAR) - free_uuar(&context->uuari, qp->uuarn); + if (base->ubuffer.umem) + ib_umem_release(base->ubuffer.umem); + free_uuar(&context->uuari, qp->uuarn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, - struct mlx5_create_qp_mbox_in **in, int *inlen) + u32 **in, int *inlen, + struct mlx5_ib_qp_base *base) { enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; struct mlx5_uuar_info *uuari; int uar_index; + void *qpc; int uuarn; int err; uuari = &dev->mdev->priv.uuari; - if (init_attr->create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_IPOIB_UD_LSO | + mlx5_ib_create_qp_sqpn_qp1())) return -EINVAL; + if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) + lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; + uuarn = alloc_uuar(uuari, lc); if (uuarn < 0) { - mlx5_ib_warn(dev, "\n"); + mlx5_ib_dbg(dev, "\n"); return -ENOMEM; } @@ -846,47 +898,60 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, err = calc_sq_size(dev, init_attr, qp); if (err < 0) { - mlx5_ib_warn(dev, "err %d\n", err); + mlx5_ib_dbg(dev, "err %d\n", err); goto err_uuar; } qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); - err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf); + err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, + 2 * PAGE_SIZE, &qp->buf); if (err) { - mlx5_ib_warn(dev, "err %d\n", err); + mlx5_ib_dbg(dev, "err %d\n", err); goto err_uuar; } qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); - *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages; + *inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * qp->buf.npages; *in = mlx5_vzalloc(*inlen); if (!*in) { err = -ENOMEM; goto err_buf; } - (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); - /* Set "fast registration enabled" for all kernel QPs */ - (*in)->ctx.params1 |= cpu_to_be32(1 << 11); - (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); - mlx5_fill_page_array(&qp->buf, (*in)->pas); + qpc = MLX5_ADDR_OF(create_qp_in, *in, qpc); + MLX5_SET(qpc, qpc, uar_page, uar_index); + MLX5_SET(qpc, qpc, log_page_size, qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + + /* Set "fast registration enabled" for all kernel QPs */ + MLX5_SET(qpc, qpc, fre, 1); + MLX5_SET(qpc, qpc, rlky, 1); + + if (init_attr->create_flags & 
mlx5_ib_create_qp_sqpn_qp1()) { + MLX5_SET(qpc, qpc, deth_sqpn, 1); + qp->flags |= MLX5_IB_QP_SQPN_QP1; + } + + mlx5_fill_page_array(&qp->buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas)); err = mlx5_db_alloc(dev->mdev, &qp->db); if (err) { - mlx5_ib_warn(dev, "err %d\n", err); + mlx5_ib_dbg(dev, "err %d\n", err); goto err_free; } - qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx), - GFP_KERNEL); - qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->rq.rwr_ctx), - GFP_KERNEL); - if (!qp->sq.swr_ctx || !qp->rq.rwr_ctx) { + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL); + qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL); + qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL); + qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL); + + if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid || + !qp->sq.w_list || !qp->sq.wqe_head) { err = -ENOMEM; goto err_wrid; } @@ -896,8 +961,11 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, err_wrid: mlx5_db_free(dev->mdev, &qp->db); - kfree(qp->sq.swr_ctx); - kfree(qp->rq.rwr_ctx); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); err_free: kvfree(*in); @@ -913,22 +981,24 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { mlx5_db_free(dev->mdev, &qp->db); - kfree(qp->sq.swr_ctx); - kfree(qp->rq.rwr_ctx); + kfree(qp->sq.wqe_head); + kfree(qp->sq.w_list); + kfree(qp->sq.wrid); + kfree(qp->sq.wr_data); + kfree(qp->rq.wrid); mlx5_buf_free(dev->mdev, &qp->buf); free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); } -static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) +static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { - enum ib_qp_type qt = attr->qp_type; - - if (attr->srq || (qt == IB_QPT_XRC_TGT) || (qt == IB_QPT_XRC_INI)) - return cpu_to_be32(MLX5_SRQ_RQ); + if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == IB_QPT_XRC_INI)) + return MLX5_SRQ_RQ; else if (!qp->has_rq) - return cpu_to_be32(MLX5_ZERO_LEN_RQ); + return MLX5_ZERO_LEN_RQ; else - return cpu_to_be32(MLX5_NON_ZERO_RQ); + return MLX5_NON_ZERO_RQ; } static int is_connected(enum ib_qp_type qp_type) @@ -939,109 +1009,574 @@ static int is_connected(enum ib_qp_type qp_type) return 0; } -static void get_cqs(enum ib_qp_type qp_type, - struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, - struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, u32 tdn) { - switch (qp_type) { - case IB_QPT_XRC_TGT: - *send_cq = NULL; - *recv_cq = NULL; - break; - case IB_QPT_XRC_INI: - *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; - *recv_cq = NULL; - break; + u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_RC: - case IB_QPT_UC: - case IB_QPT_UD: - case IB_QPT_RAW_IPV6: - case IB_QPT_RAW_ETHERTYPE: - case IB_QPT_RAW_PACKET: - *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; - *recv_cq = ib_recv_cq ? 
to_mcq(ib_recv_cq) : NULL; - break; + MLX5_SET(tisc, tisc, transport_domain, tdn); + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); +} - case IB_QPT_MAX: - default: - *send_cq = NULL; - *recv_cq = NULL; - break; +static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_tis(dev->mdev, sq->tisn); +} + +static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, void *qpin, + struct ib_pd *pd) +{ + struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; + int err; + int page_shift = 0; + int npages; + int ncont = 0; + u32 offset = 0; + + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, + &sq->ubuffer.umem, &npages, &page_shift, + &ncont, &offset); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_umem; + } + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, page_offset, offset); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); + + err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + + kvfree(in); + + if (err) + goto err_umem; + + return 0; + +err_umem: + ib_umem_release(sq->ubuffer.umem); + sq->ubuffer.umem = NULL; + + return err; +} + +static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + ib_umem_release(sq->ubuffer.umem); +} + +static int get_rq_pas_size(void *qpc) +{ + u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12; + u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride); + u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size); + u32 page_offset = MLX5_GET(qpc, qpc, page_offset); + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, void *qpin) +{ + struct mlx5_ib_qp *mqp = rq->base.container_mibqp; + __be64 *pas; + __be64 *qp_pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + int inlen; + int err; + u32 rq_pas_size = get_rq_pas_size(qpc); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, vlan_strip_disable, 1); + MLX5_SET(rqc, 
rqc, mem_rq_type, MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS) + MLX5_SET(rqc, rqc, scatter_fcs, 1); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, end_padding_mode, + MLX5_GET(qpc, qpc, end_padding_mode)); + MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset)); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); + memcpy(pas, qp_pas, rq_pas_size); + + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); +} + +static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, u32 tdn) +{ + u32 *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_tir(dev->mdev, rq->tirn); +} + +static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u32 *in, + struct ib_pd *pd) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int err; + u32 tdn = mucontext->tdn; + + if (qp->sq.wqe_cnt) { + err = create_raw_packet_qp_tis(dev, sq, tdn); + if (err) + return err; + + err = create_raw_packet_qp_sq(dev, sq, in, pd); + if (err) + goto err_destroy_tis; + + sq->base.container_mibqp = qp; + } + + if (qp->rq.wqe_cnt) { + rq->base.container_mibqp = qp; + + err = create_raw_packet_qp_rq(dev, rq, in); + if (err) + goto err_destroy_sq; + + + err = create_raw_packet_qp_tir(dev, rq, tdn); + if (err) + goto err_destroy_rq; + } + + qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? 
sq->base.mqp.qpn : + rq->base.mqp.qpn; + + return 0; + +err_destroy_rq: + destroy_raw_packet_qp_rq(dev, rq); +err_destroy_sq: + if (!qp->sq.wqe_cnt) + return err; + destroy_raw_packet_qp_sq(dev, sq); +err_destroy_tis: + destroy_raw_packet_qp_tis(dev, sq); + + return err; +} + +static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + if (qp->rq.wqe_cnt) { + destroy_raw_packet_qp_tir(dev, rq); + destroy_raw_packet_qp_rq(dev, rq); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_packet_qp_sq(dev, sq); + destroy_raw_packet_qp_tis(dev, sq); } } -enum { - MLX5_QP_END_PAD_MODE_ALIGN = MLX5_WQ_END_PAD_MODE_ALIGN, - MLX5_QP_END_PAD_MODE_NONE = MLX5_WQ_END_PAD_MODE_NONE, -}; +static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, + struct mlx5_ib_raw_packet_qp *raw_packet_qp) +{ + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + sq->sq = &qp->sq; + rq->rq = &qp->rq; + sq->doorbell = &qp->db; + rq->doorbell = &qp->db; +} + +static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) +{ + mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); +} + +static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + struct mlx5_ib_create_qp_resp resp = {}; + int inlen; + int err; + u32 *in; + void *tirc; + void *hfso; + u32 selected_fields = 0; + size_t min_resp_len; + u32 tdn = mucontext->tdn; + struct mlx5_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + return -EOPNOTSUPP; + + if (init_attr->create_flags || init_attr->send_cq) + return -EINVAL; + + min_resp_len = offsetof(typeof(resp), uuar_index) + sizeof(resp.uuar_index); + if (udata->outlen < min_resp_len) + return -EINVAL; + + required_cmd_sz = offsetof(typeof(ucmd), reserved1) + sizeof(ucmd.reserved1); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return -EOPNOTSUPP; + } + + if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)) || ucmd.reserved1) { + mlx5_ib_dbg(dev, "invalid reserved\n"); + return -EOPNOTSUPP; + } + + err = ib_copy_to_udata(udata, &resp, min_resp_len); + if (err) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EINVAL; + } + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); + MLX5_SET(tirc, tirc, disp_type, + MLX5_TIRC_DISP_TYPE_INDIRECT); + MLX5_SET(tirc, tirc, indirect_table, + init_attr->rwq_ind_tbl->ind_tbl_num); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); + switch (ucmd.rx_hash_function) { + case MLX5_RX_HASH_FUNC_TOEPLITZ: + { + void *rss_key = 
MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); + size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); + + if (len != ucmd.rx_key_len) { + err = -EINVAL; + goto err; + } + + MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FUNC_TOEPLITZ); + MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); + memcpy(rss_key, ucmd.rx_hash_key, len); + break; + } + default: + err = -EOPNOTSUPP; + goto err; + } + + if (!ucmd.rx_hash_fields_mask) { + /* special case when this TIR serves as steering entry without hashing */ + if (!init_attr->rwq_ind_tbl->log_ind_tbl_size) + goto create_tir; + err = -EINVAL; + goto err; + } + + if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { + err = -EINVAL; + goto err; + } + + /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV4); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, + MLX5_L3_PROT_TYPE_IPV6); + + if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) && + ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))) { + err = -EINVAL; + goto err; + } + + /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_TCP); + else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, + MLX5_L4_PROT_TYPE_UDP); + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; + + if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || + (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; + + MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); + +create_tir: + err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn); + + if (err) + goto err; + + kvfree(in); + /* qpn is reserved for that QP */ + qp->trans_qp.base.mqp.qpn = 0; + qp->flags |= MLX5_IB_QP_RSS; + return 0; + +err: + kvfree(in); + return err; +} static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_create_qp_mbox_in *in = NULL; - struct mlx5_exp_ib_create_qp ucmd; - struct mlx5_ib_create_qp *pucmd = NULL; + 
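Aside: create_rss_raw_qp_tir() above folds the user's per-protocol hash request (IPv4 vs. IPv6, TCP vs. UDP, which are mutually exclusive per layer) into the generic source/destination selectors programmed into the TIR. A self-contained sketch of that folding for the IPv4/TCP case; the bit values below are illustrative placeholders, not the real MLX5_RX_HASH_* / MLX5_HASH_FIELD_SEL_* ABI encoding.

#include <stdint.h>
#include <stdio.h>

enum {	/* placeholder request bits (stand-ins for MLX5_RX_HASH_*) */
	RX_HASH_SRC_IPV4     = 1 << 0,
	RX_HASH_DST_IPV4     = 1 << 1,
	RX_HASH_SRC_PORT_TCP = 1 << 4,
	RX_HASH_DST_PORT_TCP = 1 << 5,
};

enum {	/* placeholder selector bits (stand-ins for MLX5_HASH_FIELD_SEL_*) */
	SEL_SRC_IP   = 1 << 0,
	SEL_DST_IP   = 1 << 1,
	SEL_L4_SPORT = 1 << 2,
	SEL_L4_DPORT = 1 << 3,
};

/* Collapse per-protocol request bits into the generic field selector,
 * mirroring the IPv4/IPv6 and TCP/UDP folding in create_rss_raw_qp_tir(). */
static uint32_t fold_hash_fields(uint64_t mask)
{
	uint32_t sel = 0;

	if (mask & RX_HASH_SRC_IPV4)
		sel |= SEL_SRC_IP;
	if (mask & RX_HASH_DST_IPV4)
		sel |= SEL_DST_IP;
	if (mask & RX_HASH_SRC_PORT_TCP)
		sel |= SEL_L4_SPORT;
	if (mask & RX_HASH_DST_PORT_TCP)
		sel |= SEL_L4_DPORT;
	return sel;
}

int main(void)
{
	/* A typical TCP/IPv4 4-tuple request selects all four fields. */
	printf("selected_fields = 0x%x\n",
	       (unsigned int)fold_hash_fields(RX_HASH_SRC_IPV4 | RX_HASH_DST_IPV4 |
					      RX_HASH_SRC_PORT_TCP | RX_HASH_DST_PORT_TCP));
	return 0;
}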
struct mlx5_ib_create_qp_resp resp; struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; - int inlen = sizeof(*in); - size_t ucmd_size; - int err; - int st; - u32 uidx; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_qp_base *base; void *qpc; + u32 *in; + int err; + + base = init_attr->qp_type == IB_QPT_RAW_PACKET ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + mlx5_ib_odp_create_qp(qp); mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); + if (init_attr->rwq_ind_tbl) { + if (!udata) + return -ENOSYS; + + err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); + return err; + } + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { - mlx5_ib_warn(dev, "block multicast loopback isn't supported\n"); + mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); return -EINVAL; } else { qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; } } + if (init_attr->create_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { + if (!MLX5_CAP_GEN(mdev, cd)) { + mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); + return -EINVAL; + } + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX5_IB_QP_MANAGED_SEND; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX5_IB_QP_MANAGED_RECV; + } + + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) + if (!MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) { + mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); + return -EOPNOTSUPP; + } + + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); + return -EOPNOTSUPP; + } + if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || + !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; if (pd && pd->uobject) { - memset(&ucmd, 0, sizeof(ucmd)); - ucmd_size = sizeof(struct mlx5_ib_create_qp); - if (ucmd_size > offsetof(struct mlx5_exp_ib_create_qp, size_of_prefix)) { - mlx5_ib_warn(dev, "mlx5_ib_create_qp is too big to fit as prefix of mlx5_exp_ib_create_qp\n"); - return -EINVAL; + if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; } - err = ib_copy_from_udata(&ucmd, udata, min(udata->inlen, ucmd_size)); - if (err) { - mlx5_ib_err(dev, "copy failed\n"); + + err = get_qp_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) return err; - } - pucmd = (struct mlx5_ib_create_qp *)&ucmd; - if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_UIDX) - uidx = ucmd.exp.uidx; - else - uidx = 0xffffff; qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); } else { - qp->wq_sig = !!workqueue_signature; - uidx = 0xffffff; + qp->wq_sig = !!wq_signature; } qp->has_rq = qp_has_rq(init_attr); err = set_rq_size(dev, &init_attr->cap, qp->has_rq, - qp, (pd && pd->uobject) ? pucmd : NULL); + qp, (pd && pd->uobject) ? 
&ucmd : NULL); if (err) { - mlx5_ib_warn(dev, "err %d\n", err); + mlx5_ib_dbg(dev, "err %d\n", err); return err; } @@ -1052,34 +1587,34 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || ucmd.rq_wqe_count != qp->rq.wqe_cnt) { - mlx5_ib_warn(dev, "invalid rq params\n"); + mlx5_ib_dbg(dev, "invalid rq params\n"); return -EINVAL; } if (ucmd.sq_wqe_count > max_wqes) { - mlx5_ib_warn(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", - ucmd.sq_wqe_count, max_wqes); + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd.sq_wqe_count, max_wqes); + return -EINVAL; + } + if (init_attr->create_flags & + mlx5_ib_create_qp_sqpn_qp1()) { + mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); return -EINVAL; } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, - &inlen, &ucmd); + &resp, &inlen, base); if (err) - mlx5_ib_warn(dev, "err %d\n", err); + mlx5_ib_dbg(dev, "err %d\n", err); } else { - if (init_attr->qp_type == IB_QPT_RAW_PACKET) { - mlx5_ib_warn(dev, "Raw Eth QP is disabled for Kernel consumers\n"); - return -EINVAL; - } - err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, + base); if (err) - mlx5_ib_warn(dev, "err %d\n", err); - else - qp->pa_lkey = to_mpd(pd)->pa_lkey; + mlx5_ib_dbg(dev, "err %d\n", err); } if (err) return err; } else { - in = mlx5_vzalloc(sizeof(*in)); + in = mlx5_vzalloc(inlen); if (!in) return -ENOMEM; @@ -1089,26 +1624,29 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (is_sqp(init_attr->qp_type)) qp->port = init_attr->port_num; - st = to_mlx5_st(init_attr->qp_type); - if (st < 0) { - mlx5_ib_warn(dev, "invalid service type\n"); - err = st; - goto err_create; - } - in->ctx.flags |= cpu_to_be32(st << 16 | MLX5_QP_PM_MIGRATED << 11); + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, to_mlx5_st(init_attr->qp_type)); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + + if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) + MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn); + else + MLX5_SET(qpc, qpc, latency_sensitive, 1); - in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? 
pd : devr->p0)->pdn); if (qp->wq_sig) - in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG); + MLX5_SET(qpc, qpc, wq_signature, 1); if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) - in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + MLX5_SET(qpc, qpc, block_lb_mc, 1); - if (qp->flags & MLX5_IB_QP_CAP_RX_END_PADDING) - in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_ALIGN << 2); - else - in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_NONE << 2); + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + MLX5_SET(qpc, qpc, cd_master, 1); + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + MLX5_SET(qpc, qpc, cd_slave_send, 1); + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + MLX5_SET(qpc, qpc, cd_slave_receive, 1); if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; @@ -1117,99 +1655,90 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); - if (rcqe_sz == 128) { - in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE; - } else { - in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE; - } + if (rcqe_sz == 128) + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + else + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); - if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { - in->ctx.cs_req = 0; - } else { + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { if (scqe_sz == 128) - in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE; + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); else - in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE; + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); } } if (qp->rq.wqe_cnt) { - in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4); - in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3; + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); } - in->ctx.rq_type_srqn = get_rx_type(qp, init_attr); + MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, init_attr)); if (qp->sq.wqe_cnt) - in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11); + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); else - in->ctx.sq_crq_size |= cpu_to_be16(0x8000); + MLX5_SET(qpc, qpc, no_sq, 1); /* Set default resources */ switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: - in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); - in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); - in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); - in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn); break; case IB_QPT_XRC_INI: - in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn); - in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); - in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s0)->msrq.srqn); break; default: if (init_attr->srq) { - in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn); - in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(init_attr->srq)->msrq.srqn); } else { - in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn); - 
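Aside: the hunks above replace the hand-packed, big-endian in->ctx.* fields with PRM-style MLX5_SET() accessors whose offsets come from the generated interface headers. A rough, self-contained illustration of what such an accessor does; the offsets and widths below are made up for the example and do not reflect the real qpc layout in mlx5_ifc.h.

#include <stdint.h>
#include <assert.h>
#include <arpa/inet.h>	/* htonl/ntohl */

/* Write a <width>-bit field at <bit_off> (counted from the MSB side of each
 * 32-bit word) into a big-endian dword buffer -- roughly what a generated
 * MLX5_SET() expands to. */
static void set_field(uint32_t *buf, unsigned int bit_off, unsigned int width,
		      uint32_t val)
{
	unsigned int dw = bit_off / 32;
	unsigned int shift = 32 - width - (bit_off & 31);
	uint32_t mask = (width == 32) ? ~0u : ((1u << width) - 1) << shift;
	uint32_t cur = ntohl(buf[dw]);

	buf[dw] = htonl((cur & ~mask) | ((val << shift) & mask));
}

int main(void)
{
	uint32_t qpc[4] = { 0 };

	/* Hypothetical 4-bit field starting at bit 8 of the buffer. */
	set_field(qpc, 8, 4, 0xA);
	assert(((uint8_t *)qpc)[1] == 0xA0);
	return 0;
}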
in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s1)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn, to_msrq(devr->s1)->msrq.srqn); } } if (init_attr->send_cq) - in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(init_attr->send_cq)->mcq.cqn); if (init_attr->recv_cq) - in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(init_attr->recv_cq)->mcq.cqn); - in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); - if (MLX5_CAP_GEN(mdev, cqe_version)) { - qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); - /* 0xffffff means we ask to work with cqe version 0 */ + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) MLX5_SET(qpc, qpc, user_index, uidx); + + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (init_attr->qp_type == IB_QPT_UD && + (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + qp->flags |= MLX5_IB_QP_LSO; } if (init_attr->qp_type == IB_QPT_RAW_PACKET) { - if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) { - mlx5_ib_warn(dev, "Raw Ethernet QP is allowed only for Ethernet link layer\n"); - return -ENOSYS; - } - if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD) { - qp->sq_buf_addr = ucmd.exp.sq_buf_addr; - } else { - mlx5_ib_warn(dev, "Raw Ethernet QP needs SQ buff address\n"); - return -EINVAL; - } - err = -EOPNOTSUPP; + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; + raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); + err = create_raw_packet_qp(dev, qp, in, pd); } else { - err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); - qp->mqp.event = mlx5_ib_qp_event; + err = mlx5_core_create_qp(dev->mdev, &base->mqp, (struct mlx5_create_qp_mbox_in *)in, inlen); } if (err) { - mlx5_ib_warn(dev, "create qp failed\n"); + mlx5_ib_dbg(dev, "create qp failed\n"); goto err_create; } kvfree(in); - /* Hardware wants QPN written in big-endian order (after - * shifting) for send doorbell. Precompute this value to save - * a little bit when posting sends. - */ - qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, &send_cq, &recv_cq); @@ -1232,7 +1761,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, err_create: if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(pd, qp); + destroy_qp_user(pd, qp, base); else if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); @@ -1303,24 +1832,86 @@ static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) return to_mpd(qp->ibqp.pd); } +static void get_cqs(enum ib_qp_type qp_type, + struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, + struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) +{ + switch (qp_type) { + case IB_QPT_XRC_TGT: + *send_cq = NULL; + *recv_cq = NULL; + break; + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_XRC_INI: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = NULL; + break; + + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: + *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; + *recv_cq = ib_recv_cq ? 
to_mcq(ib_recv_cq) : NULL; + break; + + case IB_QPT_MAX: + default: + *send_cq = NULL; + *recv_cq = NULL; + break; + } +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 lag_tx_affinity); + static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { struct mlx5_ib_cq *send_cq, *recv_cq; - struct mlx5_modify_qp_mbox_in *in; + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; unsigned long flags; int err; - in = kzalloc(sizeof(*in), GFP_KERNEL); - if (!in) + if (qp->ibqp.rwq_ind_tbl) { + destroy_rss_raw_qp_tir(dev, qp); return; + } + + base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; if (qp->state != IB_QPS_RESET) { if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { - if (mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, in, 0, - &qp->mqp)) - mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", - qp->mqp.qpn); + struct mlx5_modify_qp_mbox_in *in; + + mlx5_ib_qp_disable_pagefaults(qp); + + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (in != NULL) { + err = mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_2RST_QP, + in, 0, &base->mqp); + kfree(in); + } else { + err = -ENOMEM; + } + } else { + struct mlx5_modify_raw_qp_param raw_qp_param = { + .operation = MLX5_CMD_OP_2RST_QP + }; + + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0); } + if (err) + mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n", + base->mqp.qpn); } get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, @@ -1337,28 +1928,28 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) list_del(&qp->cq_recv_list); if (qp->create_type == MLX5_QP_KERNEL) { - __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq) : NULL); if (send_cq != recv_cq) - __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, + NULL); } mlx5_ib_unlock_cqs(send_cq, recv_cq); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + destroy_raw_packet_qp(dev, qp); } else { - err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); + err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); if (err) mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", - qp->mqp.qpn); + base->mqp.qpn); } - kfree(in); - if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); else if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(&get_pd(qp)->ibpd, qp); + destroy_qp_user(&get_pd(qp)->ibpd, qp, base); } static const char *ib_qp_type_str(enum ib_qp_type type) @@ -1384,6 +1975,8 @@ static const char *ib_qp_type_str(enum ib_qp_type type) return "IB_QPT_XRC_TGT"; case IB_QPT_RAW_PACKET: return "IB_QPT_RAW_PACKET"; + case MLX5_IB_QPT_REG_UMR: + return "MLX5_IB_QPT_REG_UMR"; case IB_QPT_MAX: default: return "Invalid QP type"; @@ -1398,17 +1991,25 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp; u16 xrcdn = 0; int err; - u32 rcqn; - u32 scqn; - - init_attr->qpg_type = IB_QPG_NONE; if (pd) { dev = to_mdev(pd->device); + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (!pd->uobject) { + mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); + return ERR_PTR(-EINVAL); + } else if (!to_mucontext(pd->uobject->context)->cqe_version) { + mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); + return ERR_PTR(-EINVAL); + } + } } else { /* being cautious here */ - if (init_attr->qp_type != IB_QPT_XRC_TGT) { - printf("mlx5_ib: WARN: ""%s: no PD for transport %s\n", __func__, ib_qp_type_str(init_attr->qp_type)); + if (init_attr->qp_type != IB_QPT_XRC_TGT && + init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { + pr_warn("%s: no PD for transport %s\n", __func__, + ib_qp_type_str(init_attr->qp_type)); return ERR_PTR(-EINVAL); } dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); @@ -1418,7 +2019,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case IB_QPT_XRC_TGT: case IB_QPT_XRC_INI: if (!MLX5_CAP_GEN(dev->mdev, xrc)) { - mlx5_ib_warn(dev, "XRC not supported\n"); + mlx5_ib_dbg(dev, "XRC not supported\n"); return ERR_PTR(-ENOSYS); } init_attr->recv_cq = NULL; @@ -1428,20 +2029,20 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } /* fall through */ + case IB_QPT_RAW_PACKET: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_RAW_ETHERTYPE: - case IB_QPT_RAW_PACKET: + case MLX5_IB_QPT_HW_GSI: + case MLX5_IB_QPT_REG_UMR: qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); err = create_qp_common(dev, pd, init_attr, udata, qp); if (err) { - mlx5_ib_warn(dev, "create_qp_common failed\n"); + mlx5_ib_dbg(dev, "create_qp_common failed\n"); kfree(qp); return ERR_PTR(err); } @@ -1451,22 +2052,26 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, else if (is_qp1(init_attr->qp_type)) qp->ibqp.qp_num = 1; else - qp->ibqp.qp_num = qp->mqp.qpn; + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; - rcqn = init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1; - scqn = init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1; mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", - qp->ibqp.qp_num, qp->mqp.qpn, rcqn, scqn); + qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + init_attr->recv_cq ? 
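Aside: destroy_qp_common() flushes any completions still queued for the dying QP before its hardware object is destroyed, so stale CQEs cannot be reported later against a recycled QPN. A toy model of that sweep, assuming a software CQE ring where each entry records the owning qpn; the struct and field names are invented for the sketch, and the real __mlx5_ib_cq_clean() also adjusts the consumer index and handles resized buffers.

#include <stdint.h>
#include <stdio.h>

struct toy_cqe { uint32_t qpn; uint32_t opcode; };

/* Drop completions that belong to 'qpn', compacting the ring in place. */
static unsigned int toy_cq_clean(struct toy_cqe *cqes, unsigned int n, uint32_t qpn)
{
	unsigned int i, out = 0;

	for (i = 0; i < n; i++)
		if (cqes[i].qpn != qpn)
			cqes[out++] = cqes[i];
	return out;
}

int main(void)
{
	struct toy_cqe ring[] = { { 0x1a, 0 }, { 0x2b, 0 }, { 0x1a, 1 } };

	printf("%u CQEs left\n", toy_cq_clean(ring, 3, 0x1a)); /* prints 1 */
	return 0;
}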
to_mcq(init_attr->recv_cq)->mcq.cqn : -1, + init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1); - qp->xrcdn = xrcdn; + qp->trans_qp.xrcdn = xrcdn; break; + case IB_QPT_GSI: + return mlx5_ib_gsi_create_qp(pd, init_attr); + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: case IB_QPT_MAX: default: - mlx5_ib_warn(dev, "unsupported qp type %d\n", - init_attr->qp_type); + mlx5_ib_dbg(dev, "unsupported qp type %d\n", + init_attr->qp_type); /* Don't support raw QPs */ return ERR_PTR(-EINVAL); } @@ -1479,6 +2084,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); + if (unlikely(qp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_destroy_qp(qp); + destroy_qp_common(dev, mqp); kfree(mqp); @@ -1486,28 +2094,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) return 0; } -static u32 atomic_mode_qp(struct mlx5_ib_dev *dev) -{ - unsigned long mask; - unsigned long tmp; - - mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp) & - MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); - - tmp = find_last_bit(&mask, BITS_PER_LONG); - if (tmp < 2 || tmp >= BITS_PER_LONG) - return MLX5_ATOMIC_MODE_NONE; - - if (tmp == 2) - return MLX5_ATOMIC_MODE_CX; - - return tmp << MLX5_ATOMIC_MODE_OFF; -} - static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, int attr_mask) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); u32 hw_access_flags = 0; u8 dest_rd_atomic; u32 access_flags; @@ -1515,12 +2104,12 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else - dest_rd_atomic = qp->resp_depth; + dest_rd_atomic = qp->trans_qp.resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else - access_flags = qp->atomic_rd_en; + access_flags = qp->trans_qp.atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; @@ -1528,8 +2117,7 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att if (access_flags & IB_ACCESS_REMOTE_READ) hw_access_flags |= MLX5_QP_BIT_RRE; if (access_flags & IB_ACCESS_REMOTE_ATOMIC) - hw_access_flags |= (MLX5_QP_BIT_RAE | - atomic_mode_qp(dev)); + hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); if (access_flags & IB_ACCESS_REMOTE_WRITE) hw_access_flags |= MLX5_QP_BIT_RWE; @@ -1558,20 +2146,75 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) return rate + MLX5_STAT_RATE_OFFSET; } -static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, +static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 sl) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 tx_affinity) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); + + 
err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_ah_attr *ah, struct mlx5_qp_path *path, u8 port, int attr_mask, u32 path_flags, const struct ib_qp_attr *attr, - int alt) + bool alt) { - enum rdma_link_layer ll = dev->ib_dev.get_link_layer(&dev->ib_dev, - port); + enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port); int err; - int gid_type; - if ((ll == IB_LINK_LAYER_ETHERNET) || (ah->ah_flags & IB_AH_GRH)) { - int len = dev->mdev->port_caps[port - 1].gid_table_len; - if (ah->grh.sgid_index >= len) { - printf("mlx5_ib: ERR: ""sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, len - 1); + if (attr_mask & IB_QP_PKEY_INDEX) + path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index : + attr->pkey_index); + + if (ah->ah_flags & IB_AH_GRH) { + if (ah->grh.sgid_index >= + dev->mdev->port_caps[port - 1].gid_table_len) { + pr_err("sgid_index (%u) too large. max is %d\n", + ah->grh.sgid_index, + dev->mdev->port_caps[port - 1].gid_table_len); return -EINVAL; } } @@ -1579,32 +2222,21 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, if (ll == IB_LINK_LAYER_ETHERNET) { if (!(ah->ah_flags & IB_AH_GRH)) return -EINVAL; - - err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index, - &gid_type); - if (err) - return err; memcpy(path->rmac, ah->dmac, sizeof(ah->dmac)); path->udp_sport = mlx5_get_roce_udp_sport(dev, port, - ah->grh.sgid_index, - 0); - path->dci_cfi_prio_sl = (ah->sl & 0xf) << 4; + ah->grh.sgid_index); + path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4; } else { path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->grh_mlid = ah->src_path_bits & 0x7f; - path->rlid = cpu_to_be16(ah->dlid); + path->fl_free_ar |= + (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; + path->rlid = cpu_to_be16(ah->dlid); + path->grh_mlid = ah->src_path_bits & 0x7f; if (ah->ah_flags & IB_AH_GRH) path->grh_mlid |= 1 << 7; - if (attr_mask & IB_QP_PKEY_INDEX) - path->pkey_index = cpu_to_be16(alt ? - attr->alt_pkey_index : - attr->pkey_index); - path->dci_cfi_prio_sl = ah->sl & 0xf; } - path->fl_free_ar |= (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; - if (ah->ah_flags & IB_AH_GRH) { path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; @@ -1621,7 +2253,12 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, path->port = port; if (attr_mask & IB_QP_TIMEOUT) - path->ackto_lt = alt ? attr->alt_timeout << 3 : attr->timeout << 3; + path->ackto_lt = (alt ? 
attr->alt_timeout : attr->timeout) << 3; + + if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) + return modify_raw_packet_eth_prio(dev->mdev, + &qp->raw_packet_qp.sq, + ah->sl & 0xf); return 0; } @@ -1640,10 +2277,6 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_PRI_PORT, - [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PRI_PORT | - MLX5_QP_OPTPAR_DC_KEY | - MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_RAE, }, [MLX5_QP_STATE_RTR] = { [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | @@ -1663,9 +2296,6 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX, - [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_RAE | - MLX5_QP_OPTPAR_DC_KEY, }, }, [MLX5_QP_STATE_RTR] = { @@ -1680,9 +2310,6 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PM_STATE, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, - [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | - MLX5_QP_OPTPAR_PM_STATE | - MLX5_QP_OPTPAR_RAE, }, }, [MLX5_QP_STATE_RTS] = { @@ -1699,26 +2326,9 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_SRQN | MLX5_QP_OPTPAR_CQN_RCV, - [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | - MLX5_QP_OPTPAR_PM_STATE | - MLX5_QP_OPTPAR_RAE, }, }, [MLX5_QP_STATE_SQER] = { - [MLX5_QP_STATE_RTS] = { - [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, - [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, - [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE, - [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT | - MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_RAE | - MLX5_QP_OPTPAR_RRE, - [MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY | - MLX5_QP_OPTPAR_RAE, - - }, - }, - [MLX5_QP_STATE_SQD] = { [MLX5_QP_STATE_RTS] = { [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY, @@ -1796,6 +2406,134 @@ static int ib_mask_to_mlx5_opt(int ib_mask) return result; } +static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rqn, rq->base.mqp.qpn); + MLX5_SET(modify_rq_in, in, rq_state, rq->state); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counters_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); + } else + pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", + dev->ib_dev.name); + } + + err = mlx5_core_modify_rq(dev->mdev, in, inlen); + if (err) + goto out; + + rq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, int new_state) +{ + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sqn, sq->base.mqp.qpn); + MLX5_SET(modify_sq_in, in, sq_state, sq->state); + + sqc = 
MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + err = mlx5_core_modify_sq(dev, in, inlen); + if (err) + goto out; + + sq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct mlx5_modify_raw_qp_param *raw_qp_param, + u8 tx_affinity) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + int rq_state; + int sq_state; + int err; + + switch (raw_qp_param->operation) { + case MLX5_CMD_OP_RST2INIT_QP: + rq_state = MLX5_RQC_STATE_RDY; + sq_state = MLX5_SQC_STATE_RDY; + break; + case MLX5_CMD_OP_2ERR_QP: + rq_state = MLX5_RQC_STATE_ERR; + sq_state = MLX5_SQC_STATE_ERR; + break; + case MLX5_CMD_OP_2RST_QP: + rq_state = MLX5_RQC_STATE_RST; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + if (raw_qp_param->set_mask) + return -EINVAL; + else + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if (qp->rq.wqe_cnt) { + err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param); + if (err) + return err; + } + + if (qp->sq.wqe_cnt) { + if (tx_affinity) { + err = modify_raw_packet_tx_affinity(dev->mdev, sq, + tx_affinity); + if (err) + return err; + } + + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state); + } + + return 0; +} + static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1825,7 +2563,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, [MLX5_QP_STATE_SQD] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQD_RTS_QP, }, [MLX5_QP_STATE_SQER] = { [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, @@ -1840,10 +2577,12 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; struct mlx5_modify_qp_mbox_in *in; struct mlx5_ib_pd *pd; + struct mlx5_ib_port *mibport = NULL; enum mlx5_qp_state mlx5_cur, mlx5_new; enum mlx5_qp_optpar optpar; int sqd_event; @@ -1857,8 +2596,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context = &in->ctx; err = to_mlx5_st(ibqp->qp_type); - if (err < 0) + if (err < 0) { + mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); goto out; + } context->flags = cpu_to_be32(err << 16); @@ -1878,9 +2619,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; - } else if (ibqp->qp_type == IB_QPT_UD) { + } else if (ibqp->qp_type == IB_QPT_UD || + ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || @@ -1908,9 +2650,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.port = attr->port_num; if (attr_mask & IB_QP_AV) { - err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, attr_mask & IB_QP_PORT ? 
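Aside: a raw packet QP has no single hardware QP state, so modify_raw_packet_qp() above translates each verbs transition opcode into separate RQ and SQ object states (and treats INIT2INIT/INIT2RTR/RTR2RTS/RTS2RTS as no-ops unless extra parameters such as the counter set are being changed). A small sketch of that mapping; the enum values are placeholders, not the device command/state encodings.

#include <stdio.h>

enum qp_op   { OP_RST2INIT, OP_2ERR, OP_2RST };
enum wq_state { ST_RST, ST_RDY, ST_ERR };

/* Mirror the opcode -> SQ/RQ state switch in modify_raw_packet_qp(). */
static int raw_qp_op_to_state(enum qp_op op, enum wq_state *rq, enum wq_state *sq)
{
	switch (op) {
	case OP_RST2INIT: *rq = ST_RDY; *sq = ST_RDY; return 0;
	case OP_2ERR:     *rq = ST_ERR; *sq = ST_ERR; return 0;
	case OP_2RST:     *rq = ST_RST; *sq = ST_RST; return 0;
	}
	return -1;
}

int main(void)
{
	enum wq_state rq, sq;

	if (!raw_qp_op_to_state(OP_RST2INIT, &rq, &sq))
		printf("RST2INIT -> rq=%d sq=%d (both ready)\n", rq, sq);
	return 0;
}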
attr->port_num : qp->port, - attr_mask, 0, attr, 0); + attr_mask, 0, attr, false); if (err) goto out; } @@ -1919,10 +2661,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.ackto_lt |= attr->timeout << 3; if (attr_mask & IB_QP_ALT_PATH) { - err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, + &context->alt_path, attr->alt_port_num, - attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, - 0, attr, 1); + attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + 0, attr, true); if (err) goto out; } @@ -1949,7 +2692,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_SQ_PSN) - context->next_send_psn = cpu_to_be32(attr->sq_psn & 0xffffff); + context->next_send_psn = cpu_to_be32(attr->sq_psn); if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { if (attr->max_dest_rd_atomic) @@ -1957,14 +2700,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); } - if ((attr_mask & IB_QP_ACCESS_FLAGS) && - (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) && - !dev->enable_atomic_resp) { - mlx5_ib_warn(dev, "atomic responder is not supported\n"); - err = -EINVAL; - goto out; - } - if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); @@ -1972,7 +2707,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); if (attr_mask & IB_QP_RQ_PSN) - context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn & 0xffffff); + context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); if (attr_mask & IB_QP_QKEY) context->qkey = cpu_to_be32(attr->qkey); @@ -1986,61 +2721,85 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, else sqd_event = 0; - if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->sq_crq_size |= cpu_to_be16(1 << 4); - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : qp->port) - 1; - struct mlx5_ib_port *mibport = &dev->port[port_num]; - + mibport = &dev->port[port_num]; context->qp_counter_set_usr_page |= - cpu_to_be32(mibport->q_cnt_id << 24); + cpu_to_be32((u32)(mibport->q_cnt_id) << 24); } + if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + context->sq_crq_size |= cpu_to_be16(1 << 4); + + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + context->deth_sqpn = cpu_to_be32(1); + mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); mlx5_st = to_mlx5_st(ibqp->qp_type); if (mlx5_st < 0) goto out; + /* If moving to a reset or error state, we must disable page faults on + * this QP and flush all current page faults. Otherwise a stale page + * fault may attempt to work on this QP after it is reset and moved + * again to RTS, and may cause the driver and the device to get out of + * sync. 
*/ + if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) + mlx5_ib_qp_disable_pagefaults(qp); + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || !optab[mlx5_cur][mlx5_new]) - return -EINVAL; + goto out; op = optab[mlx5_cur][mlx5_new]; optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; in->optparam = cpu_to_be32(optpar); - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) - err = -EOPNOTSUPP; - else - err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, - &qp->mqp); + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + struct mlx5_modify_raw_qp_param raw_qp_param = {}; + + raw_qp_param.operation = op; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + raw_qp_param.rq_q_ctr_id = mibport->q_cnt_id; + raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; + } + err = modify_raw_packet_qp(dev, qp, &raw_qp_param, 0); + } else { + err = mlx5_core_qp_modify(dev->mdev, op, in, 0, &base->mqp); + } + if (err) goto out; + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) + mlx5_ib_qp_enable_pagefaults(qp); + qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) - qp->atomic_rd_en = attr->qp_access_flags; + qp->trans_qp.atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - qp->resp_depth = attr->max_dest_rd_atomic; + qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_ALT_PATH) - qp->alt_port = attr->alt_port_num; + qp->trans_qp.alt_port = attr->alt_port_num; /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, ibqp->srq ? to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) - mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; @@ -2048,10 +2807,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, qp->sq.tail = 0; qp->sq.cur_post = 0; qp->sq.last_poll = 0; - if (qp->db.db) { - qp->db.db[MLX5_RCV_DBR] = 0; - qp->db.db[MLX5_SND_DBR] = 0; - } + qp->db.db[MLX5_RCV_DBR] = 0; + qp->db.db[MLX5_SND_DBR] = 0; } out: @@ -2059,56 +2816,76 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, return err; } -static int ignored_ts_check(enum ib_qp_type qp_type) -{ - return 0; -} - int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; - + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); + + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - + if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } - if (!ignored_ts_check(ibqp->qp_type) && - !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, ll)) + if (qp_type != MLX5_IB_QPT_REG_UMR && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, ibqp->qp_type, attr_mask); goto out; + } if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || - attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); goto out; + } if (attr_mask & IB_QP_PKEY_INDEX) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; if (attr->pkey_index >= - dev->mdev->port_caps[port - 1].pkey_table_len) + dev->mdev->port_caps[port - 1].pkey_table_len) { + mlx5_ib_dbg(dev, "invalid pkey index %d\n", + attr->pkey_index); goto out; + } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp))) { + mlx5_ib_dbg(dev, "invalid max_rd_atomic value %d\n", + attr->max_rd_atomic); goto out; + } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > - (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) + (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp))) { + mlx5_ib_dbg(dev, "invalid max_dest_rd_atomic value %d\n", + attr->max_dest_rd_atomic); goto out; + } if (cur_state == new_state && cur_state == IB_QPS_RESET) { err = 0; @@ -2147,12 +2924,65 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, rseg->reserved = 0; } +static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, + struct ib_send_wr *wr, void *qend, + struct mlx5_ib_qp *qp, int *size) +{ + void *seg = eseg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start); + u64 left, leftlen, copysz; + void *pdata = ud_wr->header; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr_sz = cpu_to_be16(left); + + /* + * check if there is space till the end of queue, if yes, + * copy all in one shot, otherwise copy till the end of queue, + * rollback and than the copy the left + */ + leftlen = qend - (void *)eseg->inline_hdr_start; + copysz = min_t(u64, leftlen, left); + + memcpy(seg - size_of_inl_hdr_start, pdata, copysz); + + if (likely(copysz > size_of_inl_hdr_start)) { + seg += ALIGN(copysz - size_of_inl_hdr_start, 16); + *size += ALIGN(copysz - size_of_inl_hdr_start, 16) / 16; + } + + if (unlikely(copysz < left)) { /* the last wqe in the queue */ + seg = mlx5_get_send_wqe(qp, 0); + left -= copysz; + pdata += copysz; + memcpy(seg, pdata, left); + seg += ALIGN(left, 16); + *size += ALIGN(left, 16) / 16; + } + } + + return seg; +} + static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, struct ib_send_wr *wr) { - memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av)); - dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV); - dseg->av.key.qkey.qkey = 
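Aside: set_eth_seg() above copies the LSO inline header into the send queue in up to two pieces, because the header may cross the end of the WQ buffer and must continue at WQE index 0. A generic sketch of that wrap-around copy, assuming 'ring'/'ring_size' stand in for the send queue buffer and offset < ring_size; the real code additionally rounds the consumed space to 16-byte units.

#include <string.h>
#include <stdint.h>
#include <stddef.h>

/* Copy 'len' bytes of header into a ring buffer, wrapping to the start if the
 * copy would run past the end; returns the next free position in the ring. */
static void *copy_inline_hdr(uint8_t *ring, size_t ring_size, size_t offset,
			     const void *hdr, size_t len)
{
	size_t room = ring_size - offset;	/* bytes until the queue end */
	size_t first = len < room ? len : room;

	memcpy(ring + offset, hdr, first);
	if (first < len)			/* wrapped: continue at WQE 0 */
		memcpy(ring, (const uint8_t *)hdr + first, len - first);

	return ring + ((offset + len) % ring_size);
}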
cpu_to_be32(wr->wr.ud.remote_qkey); + memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey); } static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) @@ -2187,20 +3017,158 @@ static __be64 frwr_mkey_mask(void) return cpu_to_be64(result); } -static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, - struct ib_send_wr *wr, int li) +static __be64 sig_mkey_mask(void) { + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, + struct mlx5_ib_mr *mr) +{ + int ndescs = mr->ndescs; + memset(umr, 0, sizeof(*umr)); - if (li) { - umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); - umr->flags = 1 << 7; - return; + if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + umr->flags = MLX5_UMR_CHECK_NOT_FREE; + umr->klm_octowords = get_klm_octo(ndescs); + umr->mkey_mask = frwr_mkey_mask(); +} + +static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) +{ + memset(umr, 0, sizeof(*umr)); + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = 1 << 7; +} + +static __be64 get_umr_reg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_unreg_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_mtt_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_A | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(umr, 0, sizeof(*umr)); + + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ + else + umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ + + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { + umr->klm_octowords = get_klm_octo(umrwr->npages); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { + umr->mkey_mask = get_umr_update_mtt_mask(); + umr->bsf_octowords = get_klm_octo(umrwr->target.offset); + umr->flags |= 
MLX5_UMR_TRANSLATION_OFFSET_EN; + } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS) + umr->mkey_mask |= get_umr_update_access_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD) + umr->mkey_mask |= get_umr_update_pd_mask(); + if (!umr->mkey_mask) + umr->mkey_mask = get_umr_reg_mr_mask(); + } else { + umr->mkey_mask = get_umr_unreg_mr_mask(); } - umr->flags = (1 << 5); /* fail if not free */ - umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len); - umr->mkey_mask = frwr_mkey_mask(); + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; } static u8 get_umr_flags(int acc) @@ -2212,42 +3180,65 @@ static u8 get_umr_flags(int acc) MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; } -static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, - int li, int *writ) +static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, + struct mlx5_ib_mr *mr, + u32 key, int access) +{ + int ndescs = ALIGN(mr->ndescs, 8) >> 1; + + memset(seg, 0, sizeof(*seg)); + + if (mr->access_mode == MLX5_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_ACCESS_MODE_KLM) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; + seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(mr->ibmr.iova); + seg->len = cpu_to_be64(mr->ibmr.length); + seg->xlt_oct_size = cpu_to_be32(ndescs); +} + +static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) { memset(seg, 0, sizeof(*seg)); - if (li) { + seg->status = MLX5_MKEY_STATUS_FREE; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +{ + struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { seg->status = MLX5_MKEY_STATUS_FREE; return; } - seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) | - MLX5_ACCESS_MODE_MTT; - *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE); - seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00); - seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); - seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); - seg->len = cpu_to_be64(wr->wr.fast_reg.length); - seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2); - seg->log2_page_size = wr->wr.fast_reg.page_shift; + seg->flags = convert_access(umrwr->access_flags); + if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); + } + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(umrwr->mkey)); } -static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, - struct ib_send_wr *wr, - struct mlx5_core_dev *mdev, - struct mlx5_ib_pd *pd, - int writ) +static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, + struct mlx5_ib_mr *mr, + struct mlx5_ib_pd *pd) { - struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list); - u64 *page_list = wr->wr.fast_reg.page_list->page_list; - u64 perm = MLX5_EN_RD | (writ ? 
MLX5_EN_WR : 0); - int i; + int bcount = mr->desc_size * mr->ndescs; - for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) - mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm); - dseg->addr = cpu_to_be64(mfrpl->map); - dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64)); - dseg->lkey = cpu_to_be32(pd->pa_lkey); + dseg->addr = cpu_to_be64(mr->desc_map); + dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); + dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); } static __be32 send_ieth(struct ib_send_wr *wr) @@ -2277,7 +3268,7 @@ static u8 calc_sig(void *wqe, int size) return ~res; } -static u8 calc_wq_sig(void *wqe) +static u8 wq_sig(void *wqe) { return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); } @@ -2296,7 +3287,7 @@ static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, seg = wqe; wqe += sizeof(*seg); for (i = 0; i < wr->num_sge; i++) { - addr = (void *)(uintptr_t)(wr->sg_list[i].addr); + addr = (void *)(unsigned long)(wr->sg_list[i].addr); len = wr->sg_list[i].length; inl += len; @@ -2304,7 +3295,7 @@ static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, return -ENOMEM; if (unlikely(wqe + len > qend)) { - copy = (int)(qend - wqe); + copy = qend - wqe; memcpy(wqe, addr, copy); addr += copy; len -= copy; @@ -2321,38 +3312,368 @@ static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, return 0; } -static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, - struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp) +static u16 prot_field_size(enum ib_signature_type type) { - int writ = 0; - int li; + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} - li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0; - if (unlikely(wr->send_flags & IB_SEND_INLINE)) +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? 
+ MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + +static int set_sig_data_segment(struct ib_sig_handover_wr *wr, + struct mlx5_ib_qp *qp, void **seg, int *size) +{ + struct ib_sig_attrs *sig_attrs = wr->sig_attrs; + struct ib_mr *sig_mr = wr->sig_mr; + struct mlx5_bsf *bsf; + u32 data_len = wr->wr.sg_list->length; + u32 data_key = wr->wr.sg_list->lkey; + u64 data_va = wr->wr.sg_list->addr; + int ret; + int wqe_size; + + if (!wr->prot || + (data_key == wr->prot->lkey && + data_va == wr->prot->addr && + data_len == wr->prot->length)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. 
+ * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u32 prot_key = wr->prot->lkey; + u64 prot_va = wr->prot->addr; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) return -EINVAL; - set_frwr_umr_segment(*seg, wr, li); + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_sig_handover_wr *wr, u32 nelements, + u32 length, u32 pdn) +{ + struct ib_mr *sig_mr = wr->sig_mr; + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(wr->access_flags) | + MLX5_ACCESS_MODE_KLM; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + u32 nelements) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->klm_octowords = get_klm_octo(nelements); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + + +static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, + void **seg, int *size) +{ + struct ib_sig_handover_wr *wr = 
sig_handover_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); + u32 pdn = get_pd(qp)->pdn; + u32 klm_oct_size; + int region_len, ret; + + if (unlikely(wr->wr.num_sge != 1) || + unlikely(wr->access_flags & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = wr->wr.sg_list->length; + if (wr->prot && + (wr->prot->lkey != wr->wr.sg_list->lkey || + wr->prot->addr != wr->wr.sg_list->addr || + wr->prot->length != wr->wr.sg_list->length)) + region_len += wr->prot->length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + klm_oct_size = wr->prot ? 3 : 1; + + set_sig_umr_segment(*seg, klm_oct_size); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); - set_mkey_segment(*seg, wr, li, &writ); + + set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); - if (!li) { - if (unlikely(wr->wr.fast_reg.page_list_len > - wr->wr.fast_reg.page_list->max_page_list_len)) - return -ENOMEM; - set_frwr_pages(*seg, wr, mdev, pd, writ); - *seg += sizeof(struct mlx5_wqe_data_seg); - *size += (sizeof(struct mlx5_wqe_data_seg) / 16); - } + ret = set_sig_data_segment(wr, qp, seg, size); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; return 0; } +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type given.\n"); + return 1; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_reg_wr(struct mlx5_ib_qp *qp, + struct ib_reg_wr *wr, + void **seg, int *size) +{ + struct mlx5_ib_mr *mr = to_mmr(wr->mr); + struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + + if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), + "Invalid IB_SEND_INLINE send flag\n"); + return -EINVAL; + } + + set_reg_umr_seg(*seg, mr); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_reg_mkey_seg(*seg, mr, wr->key, wr->access); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + + return 0; +} + +static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size) +{ + set_linv_umr_seg(*seg); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); + 
set_linv_mkey_seg(*seg); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((*seg == qp->sq.qend))) + *seg = mlx5_get_send_wqe(qp, 0); +} + static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) { __be32 *p = NULL; @@ -2402,10 +3723,11 @@ static u8 get_fence(u8 fence, struct ib_send_wr *wr) return MLX5_FENCE_MODE_SMALL_AND_FENCE; else return fence; - - } else { - return 0; + } else if (unlikely(wr->send_flags & IB_SEND_FENCE)) { + return MLX5_FENCE_MODE_FENCE; } + + return 0; } static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, @@ -2413,18 +3735,13 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, struct ib_send_wr *wr, unsigned *idx, int *size, int nreq) { - int err = 0; - - if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) { - mlx5_ib_warn(to_mdev(qp->ibqp.device), "work queue overflow\n"); - err = -ENOMEM; - return err; - } + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) + return -ENOMEM; *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); *seg = mlx5_get_send_wqe(qp, *idx); *ctrl = *seg; - *(u32 *)(*seg + 8) = 0; + *(uint32_t *)(*seg + 8) = 0; (*ctrl)->imm = send_ieth(wr); (*ctrl)->fm_ce_se = qp->sq_signal_bits | (wr->send_flags & IB_SEND_SIGNALED ? @@ -2435,13 +3752,12 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, *seg += sizeof(**ctrl); *size = sizeof(**ctrl) / 16; - return err; + return 0; } static void finish_wqe(struct mlx5_ib_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl, - u8 size, unsigned idx, - struct ib_send_wr *wr, + u8 size, unsigned idx, u64 wr_id, int nreq, u8 fence, u8 next_fence, u32 mlx5_opcode) { @@ -2449,32 +3765,33 @@ static void finish_wqe(struct mlx5_ib_qp *qp, ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | mlx5_opcode | ((u32)opmod << 24)); - ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); ctrl->fm_ce_se |= fence; qp->fm_cache = next_fence; if (unlikely(qp->wq_sig)) - ctrl->signature = calc_wq_sig(ctrl); + ctrl->signature = wq_sig(ctrl); - qp->sq.swr_ctx[idx].wrid = wr->wr_id; - qp->sq.swr_ctx[idx].w_list.opcode = mlx5_opcode; - qp->sq.swr_ctx[idx].wqe_head = qp->sq.head + nreq; + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); - qp->sq.swr_ctx[idx].w_list.next = qp->sq.cur_post; - qp->sq.swr_ctx[idx].sig_piped = 0; + qp->sq.w_list[idx].next = qp->sq.cur_post; } + int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp *qp; + struct mlx5_ib_mr *mr; struct mlx5_wqe_data_seg *dpseg; struct mlx5_wqe_xrc_seg *xrc; - struct mlx5_bf *bf = qp->bf; + struct mlx5_bf *bf; int uninitialized_var(size); - void *qend = qp->sq.qend; + void *qend; unsigned long flags; unsigned idx; int err = 0; @@ -2486,6 +3803,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, u8 next_fence = 0; u8 fence; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + qp = to_mqp(ibqp); + bf = qp->bf; + qend = qp->sq.qend; spin_lock_irqsave(&qp->sq.lock, flags); @@ -2498,7 +3821,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, for (nreq = 0; 
wr; nreq++, wr = wr->next) { if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { - mlx5_ib_warn(dev, "Invalid opcode 0x%x\n", wr->opcode); + mlx5_ib_warn(dev, "\n"); err = -EINVAL; *bad_wr = wr; goto out; @@ -2507,15 +3830,15 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fence = qp->fm_cache; num_sge = wr->num_sge; if (unlikely(num_sge > qp->sq.max_gs)) { - mlx5_ib_warn(dev, "Max gs exceeded %d (max = %d)\n", wr->num_sge, qp->sq.max_gs); - err = -ENOMEM; + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; *bad_wr = wr; goto out; } err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq); if (err) { - mlx5_ib_warn(dev, "Failed to prepare WQE\n"); + mlx5_ib_warn(dev, "\n"); err = -ENOMEM; *bad_wr = wr; goto out; @@ -2524,7 +3847,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, switch (ibqp->qp_type) { case IB_QPT_XRC_INI: xrc = seg; - xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num); seg += sizeof(*xrc); size += sizeof(*xrc) / 16; /* fall through */ @@ -2533,8 +3855,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: - set_raddr_seg(seg, wr->wr.rdma.remote_addr, - wr->wr.rdma.rkey); + set_raddr_seg(seg, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); seg += sizeof(struct mlx5_wqe_raddr_seg); size += sizeof(struct mlx5_wqe_raddr_seg) / 16; break; @@ -2549,29 +3871,90 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, case IB_WR_LOCAL_INV: next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; - qp->sq.swr_ctx[idx].wr_data = IB_WR_LOCAL_INV; + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); - err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + set_linv_wr(qp, &seg, &size); + num_sge = 0; + break; + + case IB_WR_REG_MR: + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + qp->sq.wr_data[idx] = IB_WR_REG_MR; + ctrl->imm = cpu_to_be32(reg_wr(wr)->key); + err = set_reg_wr(qp, reg_wr(wr), &seg, &size); if (err) { - mlx5_ib_warn(dev, "Failed to prepare LOCAL_INV WQE\n"); *bad_wr = wr; goto out; } num_sge = 0; break; - case IB_WR_FAST_REG_MR: - next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; - qp->sq.swr_ctx[idx].wr_data = IB_WR_FAST_REG_MR; - ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey); - err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp); + case IB_WR_REG_SIG_MR: + qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; + mr = to_mmr(sig_handover_wr(wr)->sig_mr); + + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + err = set_sig_umr_wr(wr, qp, &seg, &size); if (err) { - mlx5_ib_warn(dev, "Failed to prepare FAST_REG_MR WQE\n"); + mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_UMR); + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + wr->send_flags &= ~IB_SEND_SIGNALED; + wr->send_flags |= IB_SEND_SOLICITED; + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->mem, + mr->sig->psv_memory.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); + err = begin_wqe(qp, &seg, &ctrl, wr, + &idx, &size, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + 
next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + err = set_psv_wr(&sig_handover_wr(wr)->sig_attrs->wire, + mr->sig->psv_wire.psv_idx, &seg, + &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + + finish_wqe(qp, ctrl, size, idx, wr->wr_id, + nreq, get_fence(fence, wr), + next_fence, MLX5_OPCODE_SET_PSV); num_sge = 0; - break; + goto skip_psv; default: break; @@ -2582,8 +3965,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, switch (wr->opcode) { case IB_WR_RDMA_WRITE: case IB_WR_RDMA_WRITE_WITH_IMM: - set_raddr_seg(seg, wr->wr.rdma.remote_addr, - wr->wr.rdma.rkey); + set_raddr_seg(seg, rdma_wr(wr)->remote_addr, + rdma_wr(wr)->rkey); seg += sizeof(struct mlx5_wqe_raddr_seg); size += sizeof(struct mlx5_wqe_raddr_seg) / 16; break; @@ -2594,20 +3977,56 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case IB_QPT_SMI: - if (!mlx5_core_is_pf(mdev)) { - err = -EINVAL; - mlx5_ib_warn(dev, "Only physical function is allowed to send SMP MADs\n"); - *bad_wr = wr; - goto out; - } - case IB_QPT_GSI: - case IB_QPT_UD: + case MLX5_IB_QPT_HW_GSI: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); size += sizeof(struct mlx5_wqe_datagram_seg) / 16; if (unlikely((seg == qend))) seg = mlx5_get_send_wqe(qp, 0); break; + case IB_QPT_UD: + set_datagram_seg(seg, wr); + seg += sizeof(struct mlx5_wqe_datagram_seg); + size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + seg += sizeof(struct mlx5_wqe_eth_pad); + size += sizeof(struct mlx5_wqe_eth_pad) / 16; + + seg = set_eth_seg(seg, wr, qend, qp, &size); + + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + } + break; + case MLX5_IB_QPT_REG_UMR: + if (wr->opcode != MLX5_IB_WR_UMR) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode\n"); + goto out; + } + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey); + set_reg_umr_segment(seg, wr); + seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + set_reg_mkey_segment(seg, wr); + seg += sizeof(struct mlx5_mkey_seg); + size += sizeof(struct mlx5_mkey_seg) / 16; + if (unlikely((seg == qend))) + seg = mlx5_get_send_wqe(qp, 0); + break; + default: break; } @@ -2617,7 +4036,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, err = set_data_inl_seg(qp, wr, seg, &sz); if (unlikely(err)) { - mlx5_ib_warn(dev, "Failed to prepare inline data segment\n"); + mlx5_ib_warn(dev, "\n"); *bad_wr = wr; goto out; } @@ -2638,9 +4057,10 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } } - finish_wqe(qp, ctrl, size, idx, wr, nreq, + finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, get_fence(fence, wr), next_fence, mlx5_ib_opcode[wr->opcode]); +skip_psv: if (0) dump_wqe(qp, idx, size); } @@ -2666,8 +4086,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __acquire(&bf->lock); /* TBD enable WC */ - if (BF_ENABLE && nreq == 1 && bf->uuarn && inl && size > 1 && - size <= bf->buf_size / 16) { + if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); /* wc_wmb(); */ } else { @@ -2709,6 +4128,9 
@@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int ind; int i; + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + spin_lock_irqsave(&qp->rq.lock, flags); if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { @@ -2751,7 +4173,7 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); } - qp->rq.rwr_ctx[ind].wrid = wr->wr_id; + qp->rq.wrid[ind] = wr->wr_id; ind = (ind + 1) & (qp->rq.wqe_cnt - 1); } @@ -2842,74 +4264,232 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at } } -int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, - struct ib_qp_init_attr *qp_init_attr) +static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u8 *sq_state) +{ + void *out; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_sq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out); + if (err) + goto out; + + sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); + *sq_state = MLX5_GET(sqc, sqc, state); + sq->state = *sq_state; + +out: + kvfree(out); + return err; +} + +static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + rq->state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + [MLX5_SQ_STATE_NA] = IB_QPS_RESET, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + [MLX5_SQ_STATE_NA] = IB_QPS_ERR, + }, + [MLX5_RQ_STATE_NA] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) { + WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", + qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, + qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); + return -EINVAL; + } + + if (*qp_state == MLX5_QP_STATE) + *qp_state = qp->state; + + return 0; +} + +static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u8 *raw_packet_qp_state) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + int err; + u8 sq_state = MLX5_SQ_STATE_NA; + u8 rq_state = MLX5_RQ_STATE_NA; + + if (qp->sq.wqe_cnt) { + err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); + if
(err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); + if (err) + return err; + } + + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_packet_qp_state); +} + +static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_attr *qp_attr) +{ + int outlen = MLX5_ST_SZ_BYTES(query_qp_out); + struct mlx5_qp_context *context; + int mlx5_state; + u32 *outb; + int err = 0; + + outb = kzalloc(outlen, GFP_KERNEL); + if (!outb) + return -ENOMEM; + + err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, + (struct mlx5_query_qp_mbox_out *)outb, + outlen); + if (err) + goto out; + + /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ + context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc); + + mlx5_state = be32_to_cpu(context->flags) >> 28; + + qp->state = to_ib_qp_state(mlx5_state); + qp_attr->path_mtu = context->mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context->qkey); + qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context->params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); + qp_attr->alt_pkey_index = + be16_to_cpu(context->alt_path.pkey_index); + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); + qp_attr->port_num = context->pri_path.port; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context->pri_path.ackto_lt >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; + qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + +out: + kfree(outb); + return err; +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); - struct mlx5_query_qp_mbox_out *outb; - struct mlx5_qp_context *context; - int mlx5_state; int err = 0; + u8 raw_packet_qp_state; + + if (ibqp->rwq_ind_tbl) + return -ENOSYS; + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, + qp_init_attr); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. 
+ */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif mutex_lock(&qp->mutex); + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { - err = -EOPNOTSUPP; - goto out; + err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); + if (err) + goto out; + qp->state = raw_packet_qp_state; + qp_attr->port_num = 1; } else { - outb = kzalloc(sizeof(*outb), GFP_KERNEL); - if (!outb) { - err = -ENOMEM; + err = query_qp_attr(dev, qp, qp_attr); + if (err) goto out; - } - - context = &outb->ctx; - err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, - sizeof(*outb)); - if (err) { - kfree(outb); - goto out; - } - - mlx5_state = be32_to_cpu(context->flags) >> 28; - - qp->state = to_ib_qp_state(mlx5_state); - qp_attr->path_mtu = context->mtu_msgmax >> 5; - qp_attr->path_mig_state = - to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); - qp_attr->qkey = be32_to_cpu(context->qkey); - qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; - qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; - qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; - qp_attr->qp_access_flags = - to_ib_qp_access_flags(be32_to_cpu(context->params2)); - - if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { - to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); - to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); - qp_attr->alt_pkey_index = be16_to_cpu(context->alt_path.pkey_index); - qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; - } - - qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); - qp_attr->port_num = context->pri_path.port; - - /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ - qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; - - qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); - - qp_attr->max_dest_rd_atomic = - 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); - qp_attr->min_rnr_timer = - (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; - qp_attr->timeout = context->pri_path.ackto_lt >> 3; - qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; - qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; - qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; - - - kfree(outb); } qp_attr->qp_state = qp->state; @@ -2938,6 +4518,15 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + if (qp->flags & MLX5_IB_QP_SQPN_QP1) + qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1(); + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? 
IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; @@ -2986,3 +4575,355 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) return 0; } + +static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) +{ + struct mlx5_ib_rwq *rwq = to_mibrwq(core_qp); + struct mlx5_ib_dev *dev = to_mdev(rwq->ibwq.device); + struct ib_event event; + + if (rwq->ibwq.event_handler) { + event.device = rwq->ibwq.device; + event.element.wq = &rwq->ibwq; + switch (type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_WQ_FATAL; + break; + default: + mlx5_ib_warn(dev, "Unexpected event type %d on WQ %06x\n", type, core_qp->qpn); + return; + } + + rwq->ibwq.event_handler(&event, rwq->ibwq.wq_context); + } +} + +static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, + struct ib_wq_init_attr *init_attr) +{ + struct mlx5_ib_dev *dev; + __be64 *rq_pas0; + void *in; + void *rqc; + void *wq; + int inlen; + int err; + + dev = to_mdev(pd->device); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rwq->rq_num_pas; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, mem_rq_type, + MLX5_RQC_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, user_index, rwq->user_index); + MLX5_SET(rqc, rqc, cqn, to_mcq(init_attr->cq)->mcq.cqn); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); + MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride); + MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size); + MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn); + MLX5_SET(wq, wq, page_offset, rwq->rq_page_offset); + MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); + MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); + MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); + rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + kvfree(in); + return err; +} + +static int set_user_rq_size(struct mlx5_ib_dev *dev, + struct ib_wq_init_attr *wq_init_attr, + struct mlx5_ib_create_wq *ucmd, + struct mlx5_ib_rwq *rwq) +{ + /* Sanity check RQ size before proceeding */ + if (wq_init_attr->max_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_wq_sz))) + return -EINVAL; + + if (!ucmd->rq_wqe_count) + return -EINVAL; + + rwq->wqe_count = ucmd->rq_wqe_count; + rwq->wqe_shift = ucmd->rq_wqe_shift; + rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift); + rwq->log_rq_stride = rwq->wqe_shift; + rwq->log_rq_size = ilog2(rwq->wqe_count); + return 0; +} + +static int prepare_user_rq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata, + struct mlx5_ib_rwq *rwq) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_create_wq ucmd = {}; + int err; + size_t required_cmd_sz; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) { + mlx5_ib_dbg(dev, "invalid inlen\n"); + return -EINVAL; + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + mlx5_ib_dbg(dev, "inlen is not supported\n"); + return -EOPNOTSUPP; + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + mlx5_ib_dbg(dev, "copy failed\n"); + return -EFAULT; + } + + if (ucmd.comp_mask) { + mlx5_ib_dbg(dev, "invalid comp mask\n"); + return 
-EOPNOTSUPP; + } + + if (ucmd.reserved) { + mlx5_ib_dbg(dev, "invalid reserved\n"); + return -EOPNOTSUPP; + } + + err = set_user_rq_size(dev, init_attr, &ucmd, rwq); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + err = create_user_rq(dev, pd, rwq, &ucmd); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + if (err) + return err; + } + + rwq->user_index = ucmd.user_index; + return 0; +} + +struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_rwq *rwq; + struct mlx5_ib_create_wq_resp resp = {}; + size_t min_resp_len; + int err; + + if (!udata) + return ERR_PTR(-ENOSYS); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + dev = to_mdev(pd->device); + switch (init_attr->wq_type) { + case IB_WQT_RQ: + rwq = kzalloc(sizeof(*rwq), GFP_KERNEL); + if (!rwq) + return ERR_PTR(-ENOMEM); + err = prepare_user_rq(pd, init_attr, udata, rwq); + if (err) + goto err; + err = create_rq(rwq, pd, init_attr); + if (err) + goto err_user_rq; + break; + default: + mlx5_ib_dbg(dev, "unsupported wq type %d\n", + init_attr->wq_type); + return ERR_PTR(-EINVAL); + } + + rwq->ibwq.wq_num = rwq->core_qp.qpn; + rwq->ibwq.state = IB_WQS_RESET; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + rwq->core_qp.event = mlx5_ib_wq_event; + rwq->ibwq.event_handler = init_attr->event_handler; + return &rwq->ibwq; + +err_copy: + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); +err_user_rq: + destroy_user_rq(pd, rwq); +err: + kfree(rwq); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_wq(struct ib_wq *wq) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + destroy_user_rq(wq->pd, rwq); + kfree(rwq); + + return 0; +} + +struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl; + int sz = 1 << init_attr->log_ind_tbl_size; + struct mlx5_ib_create_rwq_ind_tbl_resp resp = {}; + size_t min_resp_len; + int inlen; + int err; + int i; + u32 *in; + void *rqtc; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + if (init_attr->log_ind_tbl_size > + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) { + mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n", + init_attr->log_ind_tbl_size, + MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)); + return ERR_PTR(-EINVAL); + } + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL); + if (!rwq_ind_tbl) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err; + } + + rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); + + MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); + MLX5_SET(rqtc, rqtc, rqt_max_size, sz); + + for (i = 0; i < sz; i++) + MLX5_SET(rqtc, rqtc, rq_num[i], 
init_attr->ind_tbl[i]->wq_num); + + err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); + kvfree(in); + + if (err) + goto err; + + rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn; + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err_copy; + } + + return &rwq_ind_tbl->ib_rwq_ind_tbl; + +err_copy: + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); +err: + kfree(rwq_ind_tbl); + return ERR_PTR(err); +} + +int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); + struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device); + + mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + + kfree(rwq_ind_tbl); + return 0; +} + +int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(wq->device); + struct mlx5_ib_rwq *rwq = to_mrwq(wq); + struct mlx5_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + int curr_wq_state; + int wq_state; + int inlen; + int err; + void *rqc; + void *in; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + + MLX5_SET(modify_rq_in, in, rqn, rwq->core_qp.qpn); + curr_wq_state = (wq_attr_mask & IB_WQ_CUR_STATE) ? + wq_attr->curr_wq_state : wq->state; + wq_state = (wq_attr_mask & IB_WQ_STATE) ? + wq_attr->wq_state : curr_wq_state; + if (curr_wq_state == IB_WQS_ERR) + curr_wq_state = MLX5_RQC_STATE_ERR; + if (wq_state == IB_WQS_ERR) + wq_state = MLX5_RQC_STATE_ERR; + MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(rqc, rqc, state, wq_state); + + err = mlx5_core_modify_rq(dev->mdev, in, inlen); + kvfree(in); + if (!err) + rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; + + return err; +} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c deleted file mode 100644 index f7b17c561aef..000000000000 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_roce.c +++ /dev/null @@ -1,252 +0,0 @@ -/*- - * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#include -#include -#include -#include -#include -#include -#include -#include "mlx5_ib.h" - -struct net_device *mlx5_ib_get_netdev(struct ib_device *ib_dev, u8 port) -{ - struct mlx5_ib_dev *dev = to_mdev(ib_dev); - - return mlx5_get_protocol_dev(dev->mdev, MLX5_INTERFACE_PROTOCOL_ETH); -} - - -static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, - struct net_device *ndev, - void *mlx5_addr) -{ -#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) - char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, - source_l3_address); - void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, - source_mac_47_32); - union ib_gid zgid; - u16 vtag; - - memset(&zgid, 0, sizeof(zgid)); - if (0 == memcmp(gid, &zgid, sizeof(zgid))) - return; - - ether_addr_copy(mlx5_addr_mac, IF_LLADDR(ndev)); - - if (VLAN_TAG(ndev, &vtag) == 0) { - MLX5_SET_RA(mlx5_addr, vlan_valid, 1); - MLX5_SET_RA(mlx5_addr, vlan_id, vtag); - } - -#ifndef MLX5_USE_ROCE_VERSION_2 - MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); - - memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); -#else - MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); - - if (ipv6_addr_v4mapped((void *)gid)) { - MLX5_SET_RA(mlx5_addr, roce_l3_type, - MLX5_ROCE_L3_TYPE_IPV4); - memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); - } else { - MLX5_SET_RA(mlx5_addr, roce_l3_type, - MLX5_ROCE_L3_TYPE_IPV6); - memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); - } -#endif -} - -int modify_gid_roce(struct ib_device *ib_dev, u8 port, unsigned int index, - const union ib_gid *gid, struct net_device *ndev) -{ - struct mlx5_ib_dev *dev = to_mdev(ib_dev); - u32 in[MLX5_ST_SZ_DW(set_roce_address_in)]; - u32 out[MLX5_ST_SZ_DW(set_roce_address_out)]; - void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); - - memset(in, 0, sizeof(in)); - - ib_gid_to_mlx5_roce_addr(gid, ndev, in_addr); - - MLX5_SET(set_roce_address_in, in, roce_address_index, index); - MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); - - memset(out, 0, sizeof(out)); - return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); -} - -static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, - u8 *active_width) -{ - switch (eth_proto_oper) { - case MLX5_PROT_MASK(MLX5_1000BASE_CX_SGMII): - case MLX5_PROT_MASK(MLX5_1000BASE_KX): - case MLX5_PROT_MASK(MLX5_100BASE_TX): - case MLX5_PROT_MASK(MLX5_1000BASE_T): - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_SDR; - break; - case MLX5_PROT_MASK(MLX5_10GBASE_T): - case MLX5_PROT_MASK(MLX5_10GBASE_CX4): - case MLX5_PROT_MASK(MLX5_10GBASE_KX4): - case MLX5_PROT_MASK(MLX5_10GBASE_KR): - case MLX5_PROT_MASK(MLX5_10GBASE_CR): - case MLX5_PROT_MASK(MLX5_10GBASE_SR): - case MLX5_PROT_MASK(MLX5_10GBASE_ER): - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_QDR; - break; - case MLX5_PROT_MASK(MLX5_25GBASE_CR): - case MLX5_PROT_MASK(MLX5_25GBASE_KR): - case MLX5_PROT_MASK(MLX5_25GBASE_SR): - *active_width = 
IB_WIDTH_1X; - *active_speed = IB_SPEED_EDR; - break; - case MLX5_PROT_MASK(MLX5_40GBASE_CR4): - case MLX5_PROT_MASK(MLX5_40GBASE_KR4): - case MLX5_PROT_MASK(MLX5_40GBASE_SR4): - case MLX5_PROT_MASK(MLX5_40GBASE_LR4): - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_QDR; - break; - case MLX5_PROT_MASK(MLX5_50GBASE_CR2): - case MLX5_PROT_MASK(MLX5_50GBASE_KR2): - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_FDR; - break; - case MLX5_PROT_MASK(MLX5_56GBASE_R4): - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR; - break; - case MLX5_PROT_MASK(MLX5_100GBASE_CR4): - case MLX5_PROT_MASK(MLX5_100GBASE_SR4): - case MLX5_PROT_MASK(MLX5_100GBASE_KR4): - case MLX5_PROT_MASK(MLX5_100GBASE_LR4): - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - break; - default: - return -EINVAL; - } - - return 0; -} - -static int mlx5_query_roce_port_ptys(struct ib_device *ib_dev, - struct ib_port_attr *props, u8 port) -{ - struct mlx5_ib_dev *dev = to_mdev(ib_dev); - struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ptys_reg *ptys; - int err; - - ptys = kzalloc(sizeof(*ptys), GFP_KERNEL); - if (!ptys) - return -ENOMEM; - - ptys->proto_mask |= MLX5_PTYS_EN; - ptys->local_port = port; - - err = mlx5_core_access_ptys(mdev, ptys, 0); - if (err) - goto out; - - err = translate_eth_proto_oper(ptys->eth_proto_oper, - &props->active_speed, - &props->active_width); -out: - kfree(ptys); - return err; -} - -int mlx5_query_port_roce(struct ib_device *ib_dev, u8 port, - struct ib_port_attr *props) -{ - struct net_device *netdev = mlx5_ib_get_netdev(ib_dev, port); - struct mlx5_ib_dev *dev = to_mdev(ib_dev); - enum ib_mtu netdev_ib_mtu; - - memset(props, 0, sizeof(*props)); - - props->port_cap_flags |= IB_PORT_CM_SUP; - - props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, - roce_address_table_size); - props->max_mtu = IB_MTU_4096; - props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); - props->pkey_tbl_len = 1; - props->state = IB_PORT_DOWN; - props->phys_state = 3; - - if (mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, - (u16 *)&props->qkey_viol_cntr)) - printf("mlx5_ib: WARN: ""%s failed to query qkey violations counter\n", __func__); - - - if (!netdev) - return 0; - - if (netif_running(netdev) && netif_carrier_ok(netdev)) { - props->state = IB_PORT_ACTIVE; - props->phys_state = 5; - } - - netdev_ib_mtu = iboe_get_mtu(netdev->if_mtu); - props->active_mtu = min(props->max_mtu, netdev_ib_mtu); - - mlx5_query_roce_port_ptys(ib_dev, props, port); - - return 0; -} - -__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port, - int index, __be16 ah_s_udp_port) -{ -#ifndef MLX5_USE_ROCE_VERSION_2 - return 0; -#else - return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); -#endif -} - -int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port, - int index, int *gid_type) -{ - union ib_gid gid; - int ret; - - ret = ib_get_cached_gid(&dev->ib_dev, port, index, &gid); - - if (!ret) - *gid_type = -1; - - return ret; -} diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c index 93dec9c040f9..c0b2ec1e52b6 100644 --- a/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c +++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_srq.c @@ -33,7 +33,6 @@ #include #include "mlx5_ib.h" -#include "user.h" /* not supported currently */ static int srq_signature; @@ -59,7 +58,8 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, int type) event.event = IB_EVENT_SRQ_ERR; break; default: - printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on SRQ %06x\n", type, 
srq->srqn); + pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n", + type, srq->srqn); return; } @@ -69,31 +69,39 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, int type) static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, - struct ib_udata *udata, int buf_size, int *inlen) + struct ib_udata *udata, int buf_size, int *inlen, + int type) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_ib_create_srq ucmd; + struct mlx5_ib_create_srq ucmd = {}; size_t ucmdlen; - void *xsrqc; int err; int npages; int page_shift; int ncont; - int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); u32 offset; + u32 uidx = MLX5_IB_DEFAULT_UIDX; - ucmdlen = (drv_data < sizeof(ucmd)) ? - drv_data : sizeof(ucmd); + ucmdlen = min(udata->inlen, sizeof(ucmd)); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { - mlx5_ib_err(dev, "failed copy udata\n"); + mlx5_ib_dbg(dev, "failed copy udata\n"); return -EFAULT; } - if (ucmdlen == sizeof(ucmd) && - ucmd.reserved1 != 0) { - mlx5_ib_warn(dev, "corrupted ucmd\n"); + if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EINVAL; + + if (type == IB_SRQT_XRC) { + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; } srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); @@ -101,7 +109,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, 0, 0); if (IS_ERR(srq->umem)) { - mlx5_ib_warn(dev, "failed umem get, size %d\n", buf_size); + mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); err = PTR_ERR(srq->umem); return err; } @@ -118,7 +126,6 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; *in = mlx5_vzalloc(*inlen); if (!(*in)) { - mlx5_ib_err(dev, "failed allocate mbox\n"); err = -ENOMEM; goto err_umem; } @@ -128,21 +135,18 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &srq->db); if (err) { - mlx5_ib_warn(dev, "map doorbell failed\n"); + mlx5_ib_dbg(dev, "map doorbell failed\n"); goto err_in; } (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); - if (MLX5_CAP_GEN(dev->mdev, cqe_version)) { - xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, - xrc_srq_context_entry); - /* 0xffffff means we ask to work with cqe version 0 */ - if (drv_data > offsetof(struct mlx5_ib_create_srq, uidx)) - MLX5_SET(xrc_srqc, xsrqc, user_index, ucmd.uidx); - else - MLX5_SET(xrc_srqc, xsrqc, user_index, 0xffffff); + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + type == IB_SRQT_XRC) { + void *xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); } return 0; @@ -158,13 +162,13 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, int buf_size, - int *inlen) + int *inlen, int type) { int err; int i; struct mlx5_wqe_srq_next_seg *next; int page_shift; - void *xsrqc; + int npages; err = mlx5_db_alloc(dev->mdev, &srq->db); if (err) { @@ -172,8 +176,8 @@ static int 
create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, return err; } - if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) { - mlx5_ib_err(dev, "buf alloc failed\n"); + if (mlx5_buf_alloc(dev->mdev, buf_size, 2 * PAGE_SIZE, &srq->buf)) { + mlx5_ib_dbg(dev, "buf alloc failed\n"); err = -ENOMEM; goto err_db; } @@ -189,10 +193,12 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, cpu_to_be16((i + 1) & (srq->msrq.max - 1)); } - *inlen = sizeof(**in) + sizeof(*(*in)->pas) * srq->buf.npages; + npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); + mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n", + buf_size, page_shift, srq->buf.npages, npages); + *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages; *in = mlx5_vzalloc(*inlen); if (!*in) { - mlx5_ib_err(dev, "failed allocate mbox\n"); err = -ENOMEM; goto err_buf; } @@ -200,6 +206,8 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL); if (!srq->wrid) { + mlx5_ib_dbg(dev, "kmalloc failed %lu\n", + (unsigned long)(srq->msrq.max * sizeof(u64))); err = -ENOMEM; goto err_in; } @@ -207,11 +215,11 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; - if (MLX5_CAP_GEN(dev->mdev, cqe_version)) { - xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, - xrc_srq_context_entry); - /* 0xffffff means we ask to work with cqe version 0 */ - MLX5_SET(xrc_srqc, xsrqc, user_index, 0xffffff); + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && + type == IB_SRQT_XRC) { + void *xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX); } return 0; @@ -258,9 +266,9 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, /* Sanity check SRQ size before proceeding */ if (init_attr->attr.max_wr >= max_srq_wqes) { - mlx5_ib_warn(dev, "max_wr %d, cap %d\n", - init_attr->attr.max_wr, - max_srq_wqes); + mlx5_ib_dbg(dev, "max_wr %d, cap %d\n", + init_attr->attr.max_wr, + max_srq_wqes); return ERR_PTR(-EINVAL); } @@ -286,9 +294,9 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, srq->msrq.max_avail_gather); if (pd->uobject) - err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen, init_attr->srq_type); else - err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen, init_attr->srq_type); if (err) { mlx5_ib_warn(dev, "create srq %s failed, err %d\n", @@ -315,7 +323,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen, is_xrc); kvfree(in); if (err) { - mlx5_ib_warn(dev, "create SRQ failed, err %d\n", err); + mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err); goto err_usr_kern_srq; } @@ -326,7 +334,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, if (pd->uobject) if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { - mlx5_ib_err(dev, "copy to user failed\n"); + mlx5_ib_dbg(dev, "copy to user failed\n"); err = -EFAULT; goto err_core; } @@ -450,7 +458,6 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; *bad_wr = wr; - nreq = 0; goto out; } diff --git a/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c 
b/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c
new file mode 100644
index 000000000000..6088fb48bb5f
--- /dev/null
+++ b/sys/dev/mlx5/mlx5_ib/mlx5_ib_virt.c
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2016, Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include
+#include
+#include "mlx5_ib.h"
+
+int mlx5_ib_get_vf_config(struct ib_device *device, int vf, u8 port,
+			  struct ifla_vf_info *info)
+{
+	return -EOPNOTSUPP;
+}
+
+int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf,
+			      u8 port, int state)
+{
+	return -EOPNOTSUPP;
+}
+
+int mlx5_ib_get_vf_stats(struct ib_device *device, int vf,
+			 u8 port, struct ifla_vf_stats *stats)
+{
+	return -EOPNOTSUPP;
+}
+
+int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port,
+			u64 guid, int type)
+{
+	return -EOPNOTSUPP;
+}
diff --git a/sys/dev/mlx5/mlx5_ib/user.h b/sys/dev/mlx5/mlx5_ib/user.h
deleted file mode 100644
index bc55952312a8..000000000000
--- a/sys/dev/mlx5/mlx5_ib/user.h
+++ /dev/null
@@ -1,318 +0,0 @@
-/*-
- * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $FreeBSD$
- */
-
-#ifndef MLX5_IB_USER_H
-#define MLX5_IB_USER_H
-
-#include
-
-enum {
-	MLX5_QP_FLAG_SIGNATURE = 1 << 0,
-};
-
-enum {
-	MLX5_SRQ_FLAG_SIGNATURE = 1 << 0,
-};
-
-enum {
-	MLX5_WQ_FLAG_SIGNATURE = 1 << 0,
-};
-
-
-/* Increment this value if any changes that break userspace ABI
- * compatibility are made.
- */
-#define MLX5_IB_UVERBS_ABI_VERSION 1
-
-/* Make sure that all structs defined in this file remain laid out so
- * that they pack the same way on 32-bit and 64-bit architectures (to
- * avoid incompatibility between 32-bit userspace and 64-bit kernels).
- * In particular do not use pointer types -- pass pointers in __u64
- * instead.
- */
-
-struct mlx5_ib_alloc_ucontext_req {
-	__u32 total_num_uuars;
-	__u32 num_low_latency_uuars;
-};
-
-struct mlx5_ib_alloc_ucontext_req_v2 {
-	__u32 total_num_uuars;
-	__u32 num_low_latency_uuars;
-	__u32 flags;
-	__u32 reserved;
-};
-
-struct mlx5_ib_alloc_ucontext_resp {
-	__u32 qp_tab_size;
-	__u32 bf_reg_size;
-	__u32 tot_uuars;
-	__u32 cache_line_size;
-	__u16 max_sq_desc_sz;
-	__u16 max_rq_desc_sz;
-	__u32 max_send_wqebb;
-	__u32 max_recv_wr;
-	__u32 max_srq_recv_wr;
-	__u16 num_ports;
-	__u16 reserved;
-	__u32 max_desc_sz_sq_dc;
-	__u32 atomic_arg_sizes_dc;
-	__u32 reserved1;
-	__u32 flags;
-	__u32 reserved2[5];
-};
-
-enum mlx5_exp_ib_alloc_ucontext_data_resp_mask {
-	MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_COMP_MAX_NUM = 1 << 0,
-	MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_VERSION = 1 << 1,
-	MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MIN = 1 << 2,
-	MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MAX = 1 << 3,
-	MLX5_EXP_ALLOC_CTX_RESP_MASK_HCA_CORE_CLOCK_OFFSET = 1 << 4,
-};
-
-struct mlx5_exp_ib_alloc_ucontext_data_resp {
-	__u32 comp_mask; /* use mlx5_ib_exp_alloc_ucontext_data_resp_mask */
-	__u16 cqe_comp_max_num;
-	__u8 cqe_version;
-	__u8 reserved;
-	__u16 rroce_udp_sport_min;
-	__u16 rroce_udp_sport_max;
-	__u32 hca_core_clock_offset;
-};
-
-struct mlx5_exp_ib_alloc_ucontext_resp {
-	__u32 qp_tab_size;
-	__u32 bf_reg_size;
-	__u32 tot_uuars;
-	__u32 cache_line_size;
-	__u16 max_sq_desc_sz;
-	__u16 max_rq_desc_sz;
-	__u32 max_send_wqebb;
-	__u32 max_recv_wr;
-	__u32 max_srq_recv_wr;
-	__u16 num_ports;
-	__u16 reserved;
-	__u32 max_desc_sz_sq_dc;
-	__u32 atomic_arg_sizes_dc;
-	__u32 reserved1;
-	__u32 flags;
-	__u32 reserved2[5];
-	/* Some more reserved fields for
-	 * future growth of mlx5_ib_alloc_ucontext_resp */
-	__u64 prefix_reserved[8];
-	struct mlx5_exp_ib_alloc_ucontext_data_resp exp_data;
-};
-
-struct mlx5_ib_alloc_pd_resp {
-	__u32 pdn;
-};
-
-struct mlx5_ib_create_cq {
-	__u64 buf_addr;
-	__u64 db_addr;
-	__u32 cqe_size;
-	__u32 reserved; /* explicit padding (optional on i386) */
-};
-
-enum mlx5_exp_ib_create_cq_mask {
-	MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_EN = 1 << 0,
-	MLX5_EXP_CREATE_CQ_MASK_CQE_COMP_RECV_TYPE = 1 << 1,
-	MLX5_EXP_CREATE_CQ_MASK_RESERVED = 1 << 2,
-};
-
-enum mlx5_exp_cqe_comp_recv_type {
-	MLX5_IB_CQE_FORMAT_HASH,
-	MLX5_IB_CQE_FORMAT_CSUM,
-};
-
-struct mlx5_exp_ib_create_cq_data {
-	__u32 comp_mask; /* use mlx5_exp_ib_creaet_cq_mask */
-	__u8 cqe_comp_en;
-	__u8 cqe_comp_recv_type; /* use mlx5_exp_cqe_comp_recv_type */
-	__u16 reserved;
-};
-
-struct mlx5_exp_ib_create_cq {
-	__u64 buf_addr;
-	__u64 db_addr;
-	__u32 cqe_size;
-	__u32 reserved; /* explicit padding (optional on i386) */
-
-	/* Some more reserved fields for future growth of mlx5_ib_create_cq */
-	__u64 prefix_reserved[8];
-
-	/* sizeof prefix aligned with mlx5_ib_create_cq */
-	__u64 size_of_prefix;
-	struct mlx5_exp_ib_create_cq_data exp_data;
-};
-
-struct mlx5_ib_create_cq_resp {
-	__u32 cqn;
-	__u32 reserved;
-};
-
-struct mlx5_ib_resize_cq {
-	__u64 buf_addr;
-	__u16 cqe_size;
-	__u16 reserved0;
-	__u32 reserved1;
-};
-
-struct mlx5_ib_create_srq {
-	__u64 buf_addr;
-	__u64 db_addr;
-	__u32 flags;
-	__u32 reserved; /* explicit padding (optional on i386) */
-	__u32 uidx;
-	__u32 reserved1;
-};
-
-struct mlx5_ib_create_srq_resp {
-	__u32 srqn;
-	__u32 reserved;
-};
-
-struct mlx5_ib_create_qp {
-	__u64 buf_addr;
-	__u64 db_addr;
-	__u32 sq_wqe_count;
-	__u32 rq_wqe_count;
-	__u32 rq_wqe_shift;
-	__u32 flags;
-};
-
-enum mlx5_exp_ib_create_qp_mask {
-	MLX5_EXP_CREATE_QP_MASK_UIDX = 1 << 0,
-	MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD = 1 << 1,
-	MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX = 1 << 2,
-	MLX5_EXP_CREATE_QP_MASK_FLAGS_IDX = 1 << 3,
-	MLX5_EXP_CREATE_QP_MASK_RESERVED = 1 << 4,
-};
-
-enum mlx5_exp_create_qp_flags {
-	MLX5_EXP_CREATE_QP_MULTI_PACKET_WQE_REQ_FLAG = 1 << 0,
-};
-
-enum mlx5_exp_drv_create_qp_uar_idx {
-	MLX5_EXP_CREATE_QP_DB_ONLY_UUAR = -1
-};
-
-struct mlx5_exp_ib_create_qp_data {
-	__u32 comp_mask; /* use mlx5_exp_ib_create_qp_mask */
-	__u32 uidx;
-	__u64 sq_buf_addr;
-	__u32 wc_uar_index;
-	__u32 flags; /* use mlx5_exp_create_qp_flags */
-};
-
-struct mlx5_exp_ib_create_qp {
-	/* To allow casting to mlx5_ib_create_qp the prefix is the same as
-	 * struct mlx5_ib_create_qp prefix
-	 */
-	__u64 buf_addr;
-	__u64 db_addr;
-	__u32 sq_wqe_count;
-	__u32 rq_wqe_count;
-	__u32 rq_wqe_shift;
-	__u32 flags;
-
-	/* Some more reserved fields for future growth of mlx5_ib_create_qp */
-	__u64 prefix_reserved[8];
-
-	/* sizeof prefix aligned with mlx5_ib_create_qp */
-	__u64 size_of_prefix;
-
-	/* Experimental data
-	 * Add new experimental data only inside the exp struct
-	 */
-	struct mlx5_exp_ib_create_qp_data exp;
-};
-
-enum {
-	MLX5_EXP_INVALID_UUAR = -1,
-};
-
-struct mlx5_ib_create_qp_resp {
-	__u32 uuar_index;
-	__u32 rsvd;
-};
-
-enum mlx5_exp_ib_create_qp_resp_mask {
-	MLX5_EXP_CREATE_QP_RESP_MASK_FLAGS_IDX = 1 << 0,
-	MLX5_EXP_CREATE_QP_RESP_MASK_RESERVED = 1 << 1,
-};
-
-enum mlx5_exp_create_qp_resp_flags {
-	MLX5_EXP_CREATE_QP_RESP_MULTI_PACKET_WQE_FLAG = 1 << 0,
-};
-
-struct mlx5_exp_ib_create_qp_resp_data {
-	__u32 comp_mask; /* use mlx5_exp_ib_create_qp_resp_mask */
-	__u32 flags; /* use mlx5_exp_create_qp_resp_flags */
-};
-
-struct mlx5_exp_ib_create_qp_resp {
-	__u32 uuar_index;
-	__u32 rsvd;
-
-	/* Some more reserved fields for future growth of mlx5_ib_create_qp_resp */
-	__u64 prefix_reserved[8];
-
-	/* sizeof prefix aligned with mlx5_ib_create_qp_resp */
-	__u64 size_of_prefix;
-
-	/* Experimental data
-	 * Add new experimental data only inside the exp struct
-	 */
-	struct mlx5_exp_ib_create_qp_resp_data exp;
-};
-
-struct mlx5_ib_create_dct {
-	__u32 uidx;
-	__u32 reserved;
-};
-
-struct mlx5_ib_arm_dct {
-	__u64 reserved0;
-	__u64 reserved1;
-};
-
-struct mlx5_ib_arm_dct_resp {
-	__u64 reserved0;
-	__u64 reserved1;
-};
-
-struct mlx5_ib_create_wq {
-	__u64 buf_addr;
-	__u64 db_addr;
-	__u32 rq_wqe_count;
-	__u32 rq_wqe_shift;
-	__u32 user_index;
-	__u32 flags;
-};
-
-#endif /* MLX5_IB_USER_H */
diff --git a/sys/modules/mlx5ib/Makefile b/sys/modules/mlx5ib/Makefile
index bc554ee8ca85..066d27adb3b1 100644
--- a/sys/modules/mlx5ib/Makefile
+++ b/sys/modules/mlx5ib/Makefile
@@ -6,18 +6,22 @@
 SRCS= \
 mlx5_ib_ah.c \
 mlx5_ib_cq.c \
 mlx5_ib_doorbell.c \
+mlx5_ib_gsi.c \
 mlx5_ib_mad.c \
 mlx5_ib_main.c \
 mlx5_ib_mem.c \
 mlx5_ib_mr.c \
 mlx5_ib_qp.c \
-mlx5_ib_roce.c \
 mlx5_ib_srq.c \
+mlx5_ib_virt.c \
 device_if.h bus_if.h vnode_if.h pci_if.h \
 opt_inet.h opt_inet6.h
 
 CFLAGS+= -I${SRCTOP}/sys/ofed/include
+CFLAGS+= -I${SRCTOP}/sys/ofed/include/uapi
 CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
+CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM
+CFLAGS+= -DINET -DINET6
 
 .include <bsd.kmod.mk>
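
Note: the deleted user.h above documented the rule that user/kernel ABI structures must be laid out identically on 32-bit and 64-bit builds: no C pointer types (userspace addresses travel as __u64) and explicit reserved fields so that sizeof() agrees on i386 and amd64. As an illustration only (struct name and the size check below are hypothetical, not part of mlx5_ib), a minimal sketch of that convention:

    /* Illustrative sketch of the user.h layout rule -- "example_uapi_cmd"
     * is a hypothetical struct, not a real mlx5_ib ABI structure. */
    #include <linux/types.h>	/* fixed-width ABI types __u32, __u64 */

    struct example_uapi_cmd {
            __u64 buf_addr;	/* userspace pointer passed as __u64, never as a C pointer */
            __u64 db_addr;
            __u32 flags;
            __u32 reserved;	/* explicit padding keeps the size identical on i386 and amd64 */
    };

    /* Catch accidental layout changes at compile time (C11). */
    _Static_assert(sizeof(struct example_uapi_cmd) == 24,
        "example_uapi_cmd must stay 24 bytes on all architectures");

The comp_mask/prefix_reserved/size_of_prefix fields seen in the removed structures follow the same idea: existing fields keep their offsets, and new capabilities are only appended and advertised through mask bits, so older userspace and newer kernels (or vice versa) can still agree on the common prefix.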