Branch data Line data Source code
1 : : /* SPDX-License-Identifier: BSD-3-Clause
2 : : * Copyright(C) 2023 Marvell.
3 : : */
4 : :
5 : : #include "cnxk_ep_rx.h"
6 : :
7 : : static __rte_always_inline uint32_t
8 : : hadd(__m128i x)
9 : : {
10 : : __m128i hi64 = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2));
11 : : __m128i sum64 = _mm_add_epi32(hi64, x);
12 : : __m128i hi32 = _mm_shufflelo_epi16(sum64, _MM_SHUFFLE(1, 0, 3, 2));
13 : : __m128i sum32 = _mm_add_epi32(sum64, hi32);
14 : 0 : return _mm_cvtsi128_si32(sum32);
15 : : }
16 : :
17 : : static __rte_always_inline void
18 : : cnxk_ep_process_pkts_vec_sse(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
19 : : {
20 : 0 : struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
21 : 0 : uint32_t read_idx = droq->read_idx;
22 : : struct rte_mbuf *m0, *m1, *m2, *m3;
23 : 0 : uint16_t nb_desc = droq->nb_desc;
24 : : uint32_t idx0, idx1, idx2, idx3;
25 : : uint16_t pkts = 0;
26 : : __m128i bytes;
27 : :
28 : : idx0 = read_idx;
29 : : bytes = _mm_setzero_si128();
30 [ # # # # ]: 0 : while (pkts < new_pkts) {
31 : : const __m128i bswap_mask = _mm_set_epi8(0xFF, 0xFF, 12, 13, 0xFF, 0xFF, 8, 9, 0xFF,
32 : : 0xFF, 4, 5, 0xFF, 0xFF, 0, 1);
33 : : const __m128i cpy_mask = _mm_set_epi8(0xFF, 0xFF, 9, 8, 0xFF, 0xFF, 9, 8, 0xFF,
34 : : 0xFF, 1, 0, 0xFF, 0xFF, 1, 0);
35 : : __m128i s01, s23;
36 : :
37 : : idx1 = otx_ep_incr_index(idx0, 1, nb_desc);
38 : : idx2 = otx_ep_incr_index(idx1, 1, nb_desc);
39 : : idx3 = otx_ep_incr_index(idx2, 1, nb_desc);
40 : :
41 : 0 : m0 = recv_buf_list[idx0];
42 : 0 : m1 = recv_buf_list[idx1];
43 : 0 : m2 = recv_buf_list[idx2];
44 : 0 : m3 = recv_buf_list[idx3];
45 : :
46 : : /* Load packet size big-endian. */
47 : 0 : s01 = _mm_set_epi32(cnxk_pktmbuf_mtod(m3, struct otx_ep_droq_info *)->length >> 48,
48 : 0 : cnxk_pktmbuf_mtod(m1, struct otx_ep_droq_info *)->length >> 48,
49 : 0 : cnxk_pktmbuf_mtod(m2, struct otx_ep_droq_info *)->length >> 48,
50 : 0 : cnxk_pktmbuf_mtod(m0, struct otx_ep_droq_info *)->length >> 48);
51 : : /* Convert to little-endian. */
52 : : s01 = _mm_shuffle_epi8(s01, bswap_mask);
53 : : /* Vertical add, consolidate outside loop */
54 : : bytes = _mm_add_epi32(bytes, s01);
55 : : /* Segregate to packet length and data length. */
56 : : s23 = _mm_shuffle_epi32(s01, _MM_SHUFFLE(3, 3, 1, 1));
57 : : s01 = _mm_shuffle_epi8(s01, cpy_mask);
58 : : s23 = _mm_shuffle_epi8(s23, cpy_mask);
59 : :
60 : : /* Store packet length and data length to mbuf. */
61 : 0 : *(uint64_t *)&m0->pkt_len = ((rte_xmm_t)s01).u64[0];
62 : 0 : *(uint64_t *)&m1->pkt_len = ((rte_xmm_t)s01).u64[1];
63 : 0 : *(uint64_t *)&m2->pkt_len = ((rte_xmm_t)s23).u64[0];
64 : 0 : *(uint64_t *)&m3->pkt_len = ((rte_xmm_t)s23).u64[1];
65 : :
66 : : /* Reset rearm data. */
67 : 0 : *(uint64_t *)&m0->rearm_data = droq->rearm_data;
68 : 0 : *(uint64_t *)&m1->rearm_data = droq->rearm_data;
69 : 0 : *(uint64_t *)&m2->rearm_data = droq->rearm_data;
70 : 0 : *(uint64_t *)&m3->rearm_data = droq->rearm_data;
71 : :
72 : 0 : rx_pkts[pkts++] = m0;
73 : 0 : rx_pkts[pkts++] = m1;
74 : 0 : rx_pkts[pkts++] = m2;
75 : 0 : rx_pkts[pkts++] = m3;
76 : : idx0 = otx_ep_incr_index(idx3, 1, nb_desc);
77 : : }
78 : 0 : droq->read_idx = idx0;
79 : :
80 : 0 : droq->refill_count += new_pkts;
81 : 0 : droq->pkts_pending -= new_pkts;
82 : : /* Stats */
83 : 0 : droq->stats.pkts_received += new_pkts;
84 : 0 : droq->stats.bytes_received += hadd(bytes);
85 : : }
86 : :
87 : : uint16_t __rte_noinline __rte_hot
88 : 0 : cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
89 : : {
90 : : struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
91 : : uint16_t new_pkts, vpkts;
92 : :
93 : : /* Refill RX buffers */
94 [ # # ]: 0 : if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
95 : 0 : cnxk_ep_rx_refill(droq);
96 : :
97 : 0 : new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
98 : 0 : vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
99 : : cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
100 : 0 : cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
101 : :
102 : 0 : return new_pkts;
103 : : }
104 : :
105 : : uint16_t __rte_noinline __rte_hot
106 : 0 : cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
107 : : {
108 : : struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
109 : : uint16_t new_pkts, vpkts;
110 : :
111 : : /* Refill RX buffers */
112 [ # # ]: 0 : if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
113 : 0 : cnxk_ep_rx_refill(droq);
114 : : } else {
115 : : /* SDP output goes into DROP state when output doorbell count
116 : : * goes below drop count. When door bell count is written with
117 : : * a value greater than drop count SDP output should come out
118 : : * of DROP state. Due to a race condition this is not happening.
119 : : * Writing doorbell register with 0 again may make SDP output
120 : : * come out of this state.
121 : : */
122 : :
123 : 0 : rte_write32(0, droq->pkts_credit_reg);
124 : : }
125 : :
126 : 0 : new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
127 : 0 : vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_SSE);
128 : : cnxk_ep_process_pkts_vec_sse(rx_pkts, droq, vpkts);
129 : 0 : cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
130 : :
131 : 0 : return new_pkts;
132 : : }
|