diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 9193dfb2372..350d0836010 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1692,7 +1692,6 @@ in_pcbrele_rlocked(struct inpcb *inp) MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); - MPASS(inp->inp_in_hpts == 0); crfree(inp->inp_cred); #ifdef INVARIANTS inp->inp_cred = NULL; @@ -1713,7 +1712,6 @@ in_pcbrele_wlocked(struct inpcb *inp) MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); - MPASS(inp->inp_in_hpts == 0); crfree(inp->inp_cred); #ifdef INVARIANTS inp->inp_cred = NULL; diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 984cb9e2656..62c5758268a 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -145,7 +145,6 @@ struct in_conninfo { * lock is to be obtained and SMR section exited. * * Key: - * (b) - Protected by the hpts lock. * (c) - Constant after initialization * (e) - Protected by the SMR section * (i) - Protected by the inpcb lock @@ -154,51 +153,6 @@ struct in_conninfo { * (s) - Protected by another subsystem's locks * (x) - Undefined locking * - * Notes on the tcp_hpts: - * - * First Hpts lock order is - * 1) INP_WLOCK() - * 2) HPTS_LOCK() i.e. hpts->pmtx - * - * To insert a TCB on the hpts you *must* be holding the INP_WLOCK(). - * You may check the inp->inp_in_hpts flag without the hpts lock. - * The hpts is the only one that will clear this flag holding - * only the hpts lock. This means that in your tcp_output() - * routine when you test for the inp_in_hpts flag to be 1 - * it may be transitioning to 0 (by the hpts). - * That's ok since that will just mean an extra call to tcp_output - * that most likely will find the call you executed - * (when the mis-match occurred) will have put the TCB back - * on the hpts and it will return. If your - * call did not add the inp back to the hpts then you will either - * over-send or the cwnd will block you from sending more. - * - * Note you should also be holding the INP_WLOCK() when you - * call the remove from the hpts as well. Though usually - * you are either doing this from a timer, where you need and have - * the INP_WLOCK() or from destroying your TCB where again - * you should already have the INP_WLOCK(). - * - * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and - * inp_input_cpu_set fields are controlled completely by - * the hpts. Do not ever set these. The inp_hpts_cpu_set - * and inp_input_cpu_set fields indicate if the hpts has - * setup the respective cpu field. It is advised if this - * field is 0, to enqueue the packet with the appropriate - * hpts_immediate() call. If the _set field is 1, then - * you may compare the inp_*_cpu field to the curcpu and - * may want to again insert onto the hpts if these fields - * are not equal (i.e. you are not on the expected CPU). - * - * A note on inp_hpts_calls and inp_input_calls, these - * flags are set when the hpts calls either the output - * or do_segment routines respectively. If the routine - * being called wants to use this, then it needs to - * clear the flag before returning. The hpts will not - * clear the flag. The flags can be used to tell if - * the hpts is the function calling the respective - * routine. - * * A few other notes: * * When a read lock is held, stability of the field is guaranteed; to write @@ -219,41 +173,15 @@ struct inpcb { CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ -#define inp_start_zero inp_hpts +#define inp_start_zero inp_refcount #define inp_zero_size (sizeof(struct inpcb) - \ offsetof(struct inpcb, inp_start_zero)) - TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */ - uint32_t inp_hpts_gencnt; /* XXXGL */ - uint32_t inp_hpts_request; /* Current hpts request, zero if - * fits in the pacing window (i&b). */ - /* - * Note the next fields are protected by a - * different lock (hpts-lock). This means that - * they must correspond in size to the smallest - * protectable bit field (uint8_t on x86, and - * other platfomrs potentially uint32_t?). Also - * since CPU switches can occur at different times the two - * fields can *not* be collapsed into a signal bit field. - */ -#if defined(__amd64__) || defined(__i386__) - uint8_t inp_in_hpts; /* on output hpts (lock b) */ -#else - uint32_t inp_in_hpts; /* on output hpts (lock b) */ -#endif - volatile uint16_t inp_hpts_cpu; /* Lock (i) */ - volatile uint16_t inp_irq_cpu; /* Set by LRO in behalf of or the driver */ u_int inp_refcount; /* (i) refcount */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ - uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */ - inp_hpts_calls :1, /* (i) from output hpts */ - inp_irq_cpu_set :1, /* (i) from LRO/Driver */ - inp_spare_bits2 : 3; uint8_t inp_numa_domain; /* numa domain */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ - int32_t inp_hptsslot; /* Hpts wheel slot this tcb is Lock(i&b) */ - uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c index cc1bd71d0d4..59122bb242b 100644 --- a/sys/netinet/tcp_hpts.c +++ b/sys/netinet/tcp_hpts.c @@ -199,7 +199,7 @@ struct tcp_hpts_entry { uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ struct hptsh { - TAILQ_HEAD(, inpcb) head; + TAILQ_HEAD(, tcpcb) head; uint32_t count; uint32_t gencnt; } *p_hptss; /* Hptsi wheel */ @@ -273,12 +273,6 @@ static struct hpts_domain_info { int cpu[MAXCPU]; } hpts_domains[MAXMEMDOM]; -enum { - IHPTS_NONE = 0, - IHPTS_ONQUEUE, - IHPTS_MOVING, -}; - counter_u64_t hpts_hopelessly_behind; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, @@ -426,6 +420,17 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, &tcp_hpts_no_wake_over_thresh, 0, "When we are over the threshold on the pacer do we prohibit wakeups?"); +static uint16_t +hpts_random_cpu(void) +{ + uint16_t cpuid; + uint32_t ran; + + ran = arc4random(); + cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); + return (cpuid); +} + static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, int slots_to_run, int idx, int from_callout) @@ -489,54 +494,67 @@ hpts_timeout_swi(void *arg) } static void -inp_hpts_insert(struct inpcb *inp, struct tcp_hpts_entry *hpts) +tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts) { + struct inpcb *inp = tptoinpcb(tp); struct hptsh *hptsh; INP_WLOCK_ASSERT(inp); HPTS_MTX_ASSERT(hpts); - MPASS(hpts->p_cpu == inp->inp_hpts_cpu); + MPASS(hpts->p_cpu == tp->t_hpts_cpu); MPASS(!(inp->inp_flags & INP_DROPPED)); - hptsh = &hpts->p_hptss[inp->inp_hptsslot]; + hptsh = &hpts->p_hptss[tp->t_hpts_slot]; - if (inp->inp_in_hpts == IHPTS_NONE) { - inp->inp_in_hpts = IHPTS_ONQUEUE; + if (tp->t_in_hpts == IHPTS_NONE) { + tp->t_in_hpts = IHPTS_ONQUEUE; in_pcbref(inp); - } else if (inp->inp_in_hpts == IHPTS_MOVING) { - inp->inp_in_hpts = IHPTS_ONQUEUE; + } else if (tp->t_in_hpts == IHPTS_MOVING) { + tp->t_in_hpts = IHPTS_ONQUEUE; } else - MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE); - inp->inp_hpts_gencnt = hptsh->gencnt; + MPASS(tp->t_in_hpts == IHPTS_ONQUEUE); + tp->t_hpts_gencnt = hptsh->gencnt; - TAILQ_INSERT_TAIL(&hptsh->head, inp, inp_hpts); + TAILQ_INSERT_TAIL(&hptsh->head, tp, t_hpts); hptsh->count++; hpts->p_on_queue_cnt++; } static struct tcp_hpts_entry * -tcp_hpts_lock(struct inpcb *inp) +tcp_hpts_lock(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; - INP_LOCK_ASSERT(inp); + INP_LOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_pace.rp_ent[inp->inp_hpts_cpu]; + hpts = tcp_pace.rp_ent[tp->t_hpts_cpu]; HPTS_LOCK(hpts); return (hpts); } static void -inp_hpts_release(struct inpcb *inp) +tcp_hpts_release(struct tcpcb *tp) { bool released __diagused; - inp->inp_in_hpts = IHPTS_NONE; - released = in_pcbrele_wlocked(inp); + tp->t_in_hpts = IHPTS_NONE; + released = in_pcbrele_wlocked(tptoinpcb(tp)); MPASS(released == false); } +/* + * Initialize newborn tcpcb to get ready for use with HPTS. + */ +void +tcp_hpts_init(struct tcpcb *tp) +{ + + tp->t_hpts_cpu = hpts_random_cpu(); + tp->t_lro_cpu = HPTS_CPU_NONE; + MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET)); +} + /* * Called normally with the INP_LOCKED but it * does not matter, the hpts lock is the key @@ -544,39 +562,39 @@ inp_hpts_release(struct inpcb *inp) * INP lock and then get the hpts lock. */ void -tcp_hpts_remove(struct inpcb *inp) +tcp_hpts_remove(struct tcpcb *tp) { struct tcp_hpts_entry *hpts; struct hptsh *hptsh; - INP_WLOCK_ASSERT(inp); + INP_WLOCK_ASSERT(tptoinpcb(tp)); - hpts = tcp_hpts_lock(inp); - if (inp->inp_in_hpts == IHPTS_ONQUEUE) { - hptsh = &hpts->p_hptss[inp->inp_hptsslot]; - inp->inp_hpts_request = 0; - if (__predict_true(inp->inp_hpts_gencnt == hptsh->gencnt)) { - TAILQ_REMOVE(&hptsh->head, inp, inp_hpts); + hpts = tcp_hpts_lock(tp); + if (tp->t_in_hpts == IHPTS_ONQUEUE) { + hptsh = &hpts->p_hptss[tp->t_hpts_slot]; + tp->t_hpts_request = 0; + if (__predict_true(tp->t_hpts_gencnt == hptsh->gencnt)) { + TAILQ_REMOVE(&hptsh->head, tp, t_hpts); MPASS(hptsh->count > 0); hptsh->count--; MPASS(hpts->p_on_queue_cnt > 0); hpts->p_on_queue_cnt--; - inp_hpts_release(inp); + tcp_hpts_release(tp); } else { /* * tcp_hptsi() now owns the TAILQ head of this inp. * Can't TAILQ_REMOVE, just mark it. */ #ifdef INVARIANTS - struct inpcb *tmp; + struct tcpcb *tmp; - TAILQ_FOREACH(tmp, &hptsh->head, inp_hpts) - MPASS(tmp != inp); + TAILQ_FOREACH(tmp, &hptsh->head, t_hpts) + MPASS(tmp != tp); #endif - inp->inp_in_hpts = IHPTS_MOVING; - inp->inp_hptsslot = -1; + tp->t_in_hpts = IHPTS_MOVING; + tp->t_hpts_slot = -1; } - } else if (inp->inp_in_hpts == IHPTS_MOVING) { + } else if (tp->t_in_hpts == IHPTS_MOVING) { /* * Handle a special race condition: * tcp_hptsi() moves inpcb to detached tailq @@ -585,18 +603,11 @@ tcp_hpts_remove(struct inpcb *inp) * tcp_hpts_remove() again (we are here!), then in_pcbdrop() * tcp_hptsi() finds pcb with meaningful slot and INP_DROPPED */ - inp->inp_hptsslot = -1; + tp->t_hpts_slot = -1; } HPTS_UNLOCK(hpts); } -bool -tcp_in_hpts(struct inpcb *inp) -{ - - return (inp->inp_in_hpts == IHPTS_ONQUEUE); -} - static inline int hpts_slot(uint32_t wheel_slot, uint32_t plus) { @@ -762,15 +773,15 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t * #ifdef INVARIANTS static void -check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line) +check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp, + uint32_t hptsslot, int line) { /* * Sanity checks for the pacer with invariants * on insert. */ - KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS, - ("hpts:%p inp:%p slot:%d > max", - hpts, inp, inp_hptsslot)); + KASSERT(hptsslot < NUM_OF_HPTSI_SLOTS, + ("hpts:%p tp:%p slot:%d > max", hpts, tp, hptsslot)); if ((hpts->p_hpts_active) && (hpts->p_wheel_complete == 0)) { /* @@ -781,22 +792,21 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uin */ int distance, yet_to_run; - distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot); + distance = hpts_slots_diff(hpts->p_runningslot, hptsslot); if (hpts->p_runningslot != hpts->p_cur_slot) yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot); else yet_to_run = 0; /* processing last slot */ - KASSERT(yet_to_run <= distance, - ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", - hpts, inp, inp_hptsslot, - distance, yet_to_run, - hpts->p_runningslot, hpts->p_cur_slot)); + KASSERT(yet_to_run <= distance, ("hpts:%p tp:%p slot:%d " + "distance:%d yet_to_run:%d rs:%d cs:%d", hpts, tp, + hptsslot, distance, yet_to_run, hpts->p_runningslot, + hpts->p_cur_slot)); } } #endif uint32_t -tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag) +tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag) { struct tcp_hpts_entry *hpts; struct timeval tv; @@ -804,16 +814,16 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts int32_t wheel_slot, maxslots; bool need_wakeup = false; - INP_WLOCK_ASSERT(inp); - MPASS(!tcp_in_hpts(inp)); - MPASS(!(inp->inp_flags & INP_DROPPED)); + INP_WLOCK_ASSERT(tptoinpcb(tp)); + MPASS(!(tptoinpcb(tp)->inp_flags & INP_DROPPED)); + MPASS(!tcp_in_hpts(tp)); /* * We now return the next-slot the hpts will be on, beyond its * current run (if up) or where it was when it stopped if it is * sleeping. */ - hpts = tcp_hpts_lock(inp); + hpts = tcp_hpts_lock(tp); microuptime(&tv); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); @@ -830,20 +840,20 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts } if (slot == 0) { /* Ok we need to set it on the hpts in the current slot */ - inp->inp_hpts_request = 0; + tp->t_hpts_request = 0; if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) { /* * A sleeping hpts we want in next slot to run * note that in this state p_prev_slot == p_cur_slot */ - inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1); + tp->t_hpts_slot = hpts_slot(hpts->p_prev_slot, 1); if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) need_wakeup = true; } else - inp->inp_hptsslot = hpts->p_runningslot; - if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING)) - inp_hpts_insert(inp, hpts); + tp->t_hpts_slot = hpts->p_runningslot; + if (__predict_true(tp->t_in_hpts != IHPTS_MOVING)) + tcp_hpts_insert_internal(tp, hpts); if (need_wakeup) { /* * Activate the hpts if it is sleeping and its @@ -880,28 +890,28 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts */ slot--; } - inp->inp_hptsslot = last_slot; - inp->inp_hpts_request = slot; + tp->t_hpts_slot = last_slot; + tp->t_hpts_request = slot; } else if (maxslots >= slot) { /* It all fits on the wheel */ - inp->inp_hpts_request = 0; - inp->inp_hptsslot = hpts_slot(wheel_slot, slot); + tp->t_hpts_request = 0; + tp->t_hpts_slot = hpts_slot(wheel_slot, slot); } else { /* It does not fit */ - inp->inp_hpts_request = slot - maxslots; - inp->inp_hptsslot = last_slot; + tp->t_hpts_request = slot - maxslots; + tp->t_hpts_slot = last_slot; } if (diag) { - diag->slot_remaining = inp->inp_hpts_request; - diag->inp_hptsslot = inp->inp_hptsslot; + diag->slot_remaining = tp->t_hpts_request; + diag->inp_hptsslot = tp->t_hpts_slot; } #ifdef INVARIANTS - check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); + check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line); #endif - if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING)) - inp_hpts_insert(inp, hpts); + if (__predict_true(tp->t_in_hpts != IHPTS_MOVING)) + tcp_hpts_insert_internal(tp, hpts); if ((hpts->p_hpts_active == 0) && - (inp->inp_hpts_request == 0) && + (tp->t_hpts_request == 0) && (hpts->p_on_min_sleep == 0)) { /* * The hpts is sleeping and NOT on a minimum @@ -972,54 +982,35 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts return (slot_on); } -uint16_t -hpts_random_cpu(struct inpcb *inp){ - /* - * No flow type set distribute the load randomly. - */ - uint16_t cpuid; - uint32_t ran; - - /* - * Shortcut if it is already set. XXXGL: does it happen? - */ - if (inp->inp_hpts_cpu_set) { - return (inp->inp_hpts_cpu); - } - /* Nothing set use a random number */ - ran = arc4random(); - cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); - return (cpuid); -} - static uint16_t -hpts_cpuid(struct inpcb *inp, int *failed) +hpts_cpuid(struct tcpcb *tp, int *failed) { + struct inpcb *inp = tptoinpcb(tp); u_int cpuid; #ifdef NUMA struct hpts_domain_info *di; #endif *failed = 0; - if (inp->inp_hpts_cpu_set) { - return (inp->inp_hpts_cpu); + if (tp->t_flags2 & TF2_HPTS_CPU_SET) { + return (tp->t_hpts_cpu); } /* * If we are using the irq cpu set by LRO or * the driver then it overrides all other domains. */ if (tcp_use_irq_cpu) { - if (inp->inp_irq_cpu_set == 0) { + if (tp->t_lro_cpu == HPTS_CPU_NONE) { *failed = 1; - return(0); + return (0); } - return(inp->inp_irq_cpu); + return (tp->t_lro_cpu); } /* If one is set the other must be the same */ #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); if (cpuid == NETISR_CPUID_NONE) - return (hpts_random_cpu(inp)); + return (hpts_random_cpu()); else return (cpuid); #endif @@ -1030,7 +1021,7 @@ hpts_cpuid(struct inpcb *inp, int *failed) */ if (inp->inp_flowtype == M_HASHTYPE_NONE) { counter_u64_add(cpu_uses_random, 1); - return (hpts_random_cpu(inp)); + return (hpts_random_cpu()); } /* * Hash to a thread based on the flowid. If we are using numa, @@ -1081,12 +1072,10 @@ static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) { struct tcpcb *tp; - struct inpcb *inp; struct timeval tv; int32_t slots_to_run, i, error; int32_t loop_cnt = 0; int32_t did_prefetch = 0; - int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; int32_t wrap_loop_cnt = 0; int32_t slot_pos_of_endpoint = 0; @@ -1154,25 +1143,25 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) * run them, the extra 10usecs of late (by being * put behind) does not really matter in this situation. */ - TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot].head, - inp_hpts) { - MPASS(inp->inp_hptsslot == hpts->p_nxt_slot); - MPASS(inp->inp_hpts_gencnt == + TAILQ_FOREACH(tp, &hpts->p_hptss[hpts->p_nxt_slot].head, + t_hpts) { + MPASS(tp->t_hpts_slot == hpts->p_nxt_slot); + MPASS(tp->t_hpts_gencnt == hpts->p_hptss[hpts->p_nxt_slot].gencnt); - MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE); + MPASS(tp->t_in_hpts == IHPTS_ONQUEUE); /* * Update gencnt and nextslot accordingly to match * the new location. This is safe since it takes both * the INP lock and the pacer mutex to change the - * inp_hptsslot and inp_hpts_gencnt. + * t_hptsslot and t_hpts_gencnt. */ - inp->inp_hpts_gencnt = + tp->t_hpts_gencnt = hpts->p_hptss[hpts->p_runningslot].gencnt; - inp->inp_hptsslot = hpts->p_runningslot; + tp->t_hpts_slot = hpts->p_runningslot; } TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head, - &hpts->p_hptss[hpts->p_nxt_slot].head, inp_hpts); + &hpts->p_hptss[hpts->p_nxt_slot].head, t_hpts); hpts->p_hptss[hpts->p_runningslot].count += hpts->p_hptss[hpts->p_nxt_slot].count; hpts->p_hptss[hpts->p_nxt_slot].count = 0; @@ -1191,8 +1180,8 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) goto no_one; } for (i = 0; i < slots_to_run; i++) { - struct inpcb *inp, *ninp; - TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head); + struct tcpcb *tp, *ntp; + TAILQ_HEAD(, tcpcb) head = TAILQ_HEAD_INITIALIZER(head); struct hptsh *hptsh; uint32_t runningslot; @@ -1205,161 +1194,25 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) runningslot = hpts->p_runningslot; hptsh = &hpts->p_hptss[runningslot]; - TAILQ_SWAP(&head, &hptsh->head, inpcb, inp_hpts); + TAILQ_SWAP(&head, &hptsh->head, tcpcb, t_hpts); hpts->p_on_queue_cnt -= hptsh->count; hptsh->count = 0; hptsh->gencnt++; HPTS_UNLOCK(hpts); - TAILQ_FOREACH_SAFE(inp, &head, inp_hpts, ninp) { + TAILQ_FOREACH_SAFE(tp, &head, t_hpts, ntp) { + struct inpcb *inp = tptoinpcb(tp); bool set_cpu; - if (ninp != NULL) { - /* We prefetch the next inp if possible */ - kern_prefetch(ninp, &prefetch_ninp); - prefetch_ninp = 1; - } - - /* For debugging */ - if (seen_endpoint == 0) { - seen_endpoint = 1; - orig_exit_slot = slot_pos_of_endpoint = - runningslot; - } else if (completed_measure == 0) { - /* Record the new position */ - orig_exit_slot = runningslot; - } - - INP_WLOCK(inp); - if (inp->inp_hpts_cpu_set == 0) { - set_cpu = true; - } else { - set_cpu = false; - } - - if (__predict_false(inp->inp_in_hpts == IHPTS_MOVING)) { - if (inp->inp_hptsslot == -1) { - inp->inp_in_hpts = IHPTS_NONE; - if (in_pcbrele_wlocked(inp) == false) - INP_WUNLOCK(inp); - } else { - HPTS_LOCK(hpts); - inp_hpts_insert(inp, hpts); - HPTS_UNLOCK(hpts); - INP_WUNLOCK(inp); - } - continue; - } - - MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE); - MPASS(!(inp->inp_flags & INP_DROPPED)); - KASSERT(runningslot == inp->inp_hptsslot, - ("Hpts:%p inp:%p slot mis-aligned %u vs %u", - hpts, inp, runningslot, inp->inp_hptsslot)); - - if (inp->inp_hpts_request) { + if (ntp != NULL) { /* - * This guy is deferred out further in time - * then our wheel had available on it. - * Push him back on the wheel or run it - * depending. - */ - uint32_t maxslots, last_slot, remaining_slots; - - remaining_slots = slots_to_run - (i + 1); - if (inp->inp_hpts_request > remaining_slots) { - HPTS_LOCK(hpts); - /* - * How far out can we go? - */ - maxslots = max_slots_available(hpts, - hpts->p_cur_slot, &last_slot); - if (maxslots >= inp->inp_hpts_request) { - /* We can place it finally to - * be processed. */ - inp->inp_hptsslot = hpts_slot( - hpts->p_runningslot, - inp->inp_hpts_request); - inp->inp_hpts_request = 0; - } else { - /* Work off some more time */ - inp->inp_hptsslot = last_slot; - inp->inp_hpts_request -= - maxslots; - } - inp_hpts_insert(inp, hpts); - HPTS_UNLOCK(hpts); - INP_WUNLOCK(inp); - continue; - } - inp->inp_hpts_request = 0; - /* Fall through we will so do it now */ - } - - inp_hpts_release(inp); - tp = intotcpcb(inp); - MPASS(tp); - if (set_cpu) { - /* - * Setup so the next time we will move to - * the right CPU. This should be a rare - * event. It will sometimes happens when we - * are the client side (usually not the - * server). Somehow tcp_output() gets called - * before the tcp_do_segment() sets the - * intial state. This means the r_cpu and - * r_hpts_cpu is 0. We get on the hpts, and - * then tcp_input() gets called setting up - * the r_cpu to the correct value. The hpts - * goes off and sees the mis-match. We - * simply correct it here and the CPU will - * switch to the new hpts nextime the tcb - * gets added to the hpts (not this one) - * :-) - */ - tcp_set_hpts(inp); - } - CURVNET_SET(inp->inp_vnet); - /* Lets do any logging that we might want to */ - if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { - tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); - } - - if (tp->t_fb_ptr != NULL) { - kern_prefetch(tp->t_fb_ptr, &did_prefetch); - did_prefetch = 1; - } - /* - * We set inp_hpts_calls to 1 before any possible output. - * The contract with the transport is that if it cares about - * hpts calling it should clear the flag. That way next time - * it is called it will know it is hpts. - * - * We also only call tfb_do_queued_segments() tcp_output() - * it is expected that if segments are queued and come in that - * the final input mbuf will cause a call to output if it is needed. - */ - inp->inp_hpts_calls = 1; - if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && - !STAILQ_EMPTY(&tp->t_inqueue)) { - error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0); - if (error) { - /* The input killed the connection */ - goto skip_pacing; - } - } - error = tcp_output(tp); - if (error < 0) - goto skip_pacing; - if (ninp) { - /* - * If we have a nxt inp, see if we can + * If we have a next tcpcb, see if we can * prefetch it. Note this may seem * "risky" since we have no locks (other * than the previous inp) and there no - * assurance that ninp was not pulled while - * we were processing inp and freed. If this + * assurance that ntp was not pulled while + * we were processing tp and freed. If this * occurred it could mean that either: * * a) Its NULL (which is fine we won't go @@ -1384,12 +1237,143 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) * cause us to load cache with a useless * address (to us). * - * XXXGL: with tcpcb == inpcb, I'm unsure this - * prefetch is still correct and useful. + * XXXGL: this comment and the prefetch action + * could be outdated after tp == inp change. */ - kern_prefetch(ninp, &prefetch_tp); + kern_prefetch(ntp, &prefetch_tp); prefetch_tp = 1; } + + /* For debugging */ + if (seen_endpoint == 0) { + seen_endpoint = 1; + orig_exit_slot = slot_pos_of_endpoint = + runningslot; + } else if (completed_measure == 0) { + /* Record the new position */ + orig_exit_slot = runningslot; + } + + INP_WLOCK(inp); + if ((tp->t_flags2 & TF2_HPTS_CPU_SET) == 0) { + set_cpu = true; + } else { + set_cpu = false; + } + + if (__predict_false(tp->t_in_hpts == IHPTS_MOVING)) { + if (tp->t_hpts_slot == -1) { + tp->t_in_hpts = IHPTS_NONE; + if (in_pcbrele_wlocked(inp) == false) + INP_WUNLOCK(inp); + } else { + HPTS_LOCK(hpts); + tcp_hpts_insert_internal(tp, hpts); + HPTS_UNLOCK(hpts); + INP_WUNLOCK(inp); + } + continue; + } + + MPASS(tp->t_in_hpts == IHPTS_ONQUEUE); + MPASS(!(inp->inp_flags & INP_DROPPED)); + KASSERT(runningslot == tp->t_hpts_slot, + ("Hpts:%p inp:%p slot mis-aligned %u vs %u", + hpts, inp, runningslot, tp->t_hpts_slot)); + + if (tp->t_hpts_request) { + /* + * This guy is deferred out further in time + * then our wheel had available on it. + * Push him back on the wheel or run it + * depending. + */ + uint32_t maxslots, last_slot, remaining_slots; + + remaining_slots = slots_to_run - (i + 1); + if (tp->t_hpts_request > remaining_slots) { + HPTS_LOCK(hpts); + /* + * How far out can we go? + */ + maxslots = max_slots_available(hpts, + hpts->p_cur_slot, &last_slot); + if (maxslots >= tp->t_hpts_request) { + /* We can place it finally to + * be processed. */ + tp->t_hpts_slot = hpts_slot( + hpts->p_runningslot, + tp->t_hpts_request); + tp->t_hpts_request = 0; + } else { + /* Work off some more time */ + tp->t_hpts_slot = last_slot; + tp->t_hpts_request -= + maxslots; + } + tcp_hpts_insert_internal(tp, hpts); + HPTS_UNLOCK(hpts); + INP_WUNLOCK(inp); + continue; + } + tp->t_hpts_request = 0; + /* Fall through we will so do it now */ + } + + tcp_hpts_release(tp); + if (set_cpu) { + /* + * Setup so the next time we will move to + * the right CPU. This should be a rare + * event. It will sometimes happens when we + * are the client side (usually not the + * server). Somehow tcp_output() gets called + * before the tcp_do_segment() sets the + * intial state. This means the r_cpu and + * r_hpts_cpu is 0. We get on the hpts, and + * then tcp_input() gets called setting up + * the r_cpu to the correct value. The hpts + * goes off and sees the mis-match. We + * simply correct it here and the CPU will + * switch to the new hpts nextime the tcb + * gets added to the hpts (not this one) + * :-) + */ + tcp_set_hpts(tp); + } + CURVNET_SET(inp->inp_vnet); + /* Lets do any logging that we might want to */ + if (hpts_does_tp_logging && tcp_bblogging_on(tp)) { + tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); + } + + if (tp->t_fb_ptr != NULL) { + kern_prefetch(tp->t_fb_ptr, &did_prefetch); + did_prefetch = 1; + } + /* + * We set TF2_HPTS_CALLS before any possible output. + * The contract with the transport is that if it cares + * about hpts calling it should clear the flag. That + * way next time it is called it will know it is hpts. + * + * We also only call tfb_do_queued_segments() + * tcp_output(). It is expected that if segments are + * queued and come in that the final input mbuf will + * cause a call to output if it is needed. + */ + tp->t_flags2 |= TF2_HPTS_CALLS; + if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && + !STAILQ_EMPTY(&tp->t_inqueue)) { + error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0); + if (error) { + /* The input killed the connection */ + goto skip_pacing; + } + } + error = tcp_output(tp); + if (error < 0) + goto skip_pacing; INP_WUNLOCK(inp); skip_pacing: CURVNET_RESTORE(); @@ -1491,18 +1475,18 @@ tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) } void -__tcp_set_hpts(struct inpcb *inp, int32_t line) +__tcp_set_hpts(struct tcpcb *tp, int32_t line) { struct tcp_hpts_entry *hpts; int failed; - INP_WLOCK_ASSERT(inp); - hpts = tcp_hpts_lock(inp); - if ((inp->inp_in_hpts == 0) && - (inp->inp_hpts_cpu_set == 0)) { - inp->inp_hpts_cpu = hpts_cpuid(inp, &failed); + INP_WLOCK_ASSERT(tptoinpcb(tp)); + + hpts = tcp_hpts_lock(tp); + if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) { + tp->t_hpts_cpu = hpts_cpuid(tp, &failed); if (failed == 0) - inp->inp_hpts_cpu_set = 1; + tp->t_flags2 |= TF2_HPTS_CPU_SET; } mtx_unlock(&hpts->p_mtx); } diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h index 9bceca0fd34..dfa6eaf79bd 100644 --- a/sys/netinet/tcp_hpts.h +++ b/sys/netinet/tcp_hpts.h @@ -111,10 +111,14 @@ struct hpts_diag { * */ - #ifdef _KERNEL -void tcp_hpts_remove(struct inpcb *); -bool tcp_in_hpts(struct inpcb *); +void tcp_hpts_init(struct tcpcb *); +void tcp_hpts_remove(struct tcpcb *); +static bool +tcp_in_hpts(struct tcpcb *tp) +{ + return (tp->t_in_hpts == IHPTS_ONQUEUE); +} /* * To insert a TCB on the hpts you *must* be holding the @@ -140,20 +144,18 @@ bool tcp_in_hpts(struct inpcb *); * that INP_WLOCK() or from destroying your TCB where again * you should already have the INP_WLOCK(). */ -uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, +uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag); #define tcp_hpts_insert(inp, slot) \ tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL) -void __tcp_set_hpts(struct inpcb *inp, int32_t line); +void __tcp_set_hpts(struct tcpcb *tp, int32_t line); #define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__) void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason); void tcp_run_hpts(void); -uint16_t hpts_random_cpu(struct inpcb *inp); - extern int32_t tcp_min_hptsi_time; #endif /* _KERNEL */ diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 7cbf535a926..76c345add1f 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1380,10 +1380,8 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le) INP_WUNLOCK(inp); return (TCP_LRO_CANNOT); } - if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) { - inp->inp_irq_cpu = lc->lro_last_cpu; - inp->inp_irq_cpu_set = 1; - } + if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1) + tp->t_lro_cpu = lc->lro_last_cpu; /* Check if the transport doesn't support the needed optimizations. */ if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) { INP_WUNLOCK(inp); diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index f5cf362a57d..f8c7557150d 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -739,7 +739,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ int32_t delay_calc = 0; uint32_t prev_delay = 0; - if (tcp_in_hpts(inp)) { + if (tcp_in_hpts(tp)) { /* A previous call is already set up */ return; } @@ -904,14 +904,14 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; bbr->rc_pacer_started = cts; - (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), __LINE__, &diag); bbr->rc_timer_first = 0; bbr->bbr_timer_src = frm; bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); bbr_log_hpts_diag(bbr, cts, &diag); } else if (hpts_timeout) { - (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); /* * We add the flag here as well if the slot is set, @@ -1050,8 +1050,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock */ wrong_timer: if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { - if (tcp_in_hpts(inp)) - tcp_hpts_remove(inp); + if (tcp_in_hpts(tp)) + tcp_hpts_remove(tp); bbr_timer_cancel(bbr, __LINE__, cts); bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val, 0); @@ -1875,7 +1875,7 @@ bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t ct l->lt_epoch = bbr->r_ctl.rc_lt_epoch; l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain; l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain; - l->inhpts = tcp_in_hpts(bbr->rc_inp); + l->inhpts = tcp_in_hpts(bbr->rc_tp); l->use_lt_bw = bbr->rc_lt_use_bw; l->pkts_out = bbr->r_ctl.rc_flight_at_input; l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch; @@ -2496,7 +2496,7 @@ bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, u log.u_bbr.flex2 = to; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; - log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot; + log.u_bbr.flex5 = bbr->rc_tp->t_hpts_slot; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2; log.u_bbr.flex8 = which; @@ -3953,7 +3953,7 @@ bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_s bbr->rc_tlp_rtx_out = 0; bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate; tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); - if (tcp_in_hpts(bbr->rc_inp) && + if (tcp_in_hpts(bbr->rc_tp) && ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) { /* * When we enter recovery, we need to restart @@ -5209,7 +5209,7 @@ bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t left = bbr->r_ctl.rc_timer_exp - cts; ret = -3; bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); - tcp_hpts_insert(tptoinpcb(tp), HPTS_USEC_TO_SLOTS(left)); + tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(left)); return (1); } bbr->rc_tmr_stopped = 0; @@ -5240,7 +5240,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { uint8_t hpts_removed = 0; - if (tcp_in_hpts(bbr->rc_inp) && + if (tcp_in_hpts(bbr->rc_tp) && (bbr->rc_timer_first == 1)) { /* * If we are canceling timer's when we have the @@ -5248,7 +5248,7 @@ bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) * must remove ourselves from the hpts. */ hpts_removed = 1; - tcp_hpts_remove(bbr->rc_inp); + tcp_hpts_remove(bbr->rc_tp); if (bbr->r_ctl.rc_last_delay_val) { /* Update the last hptsi delay too */ uint32_t time_since_send; @@ -7920,8 +7920,8 @@ bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t li * don't want to transfer forward the time * for our sum's calculations. */ - if (tcp_in_hpts(bbr->rc_inp)) { - tcp_hpts_remove(bbr->rc_inp); + if (tcp_in_hpts(bbr->rc_tp)) { + tcp_hpts_remove(bbr->rc_tp); bbr->rc_timer_first = 0; bbr->r_ctl.rc_hpts_flags = 0; bbr->r_ctl.rc_last_delay_val = 0; @@ -9854,8 +9854,8 @@ bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr) /* We enter in persists, set the flag appropriately */ bbr->rc_in_persist = 1; } - if (tcp_in_hpts(bbr->rc_inp)) { - tcp_hpts_remove(bbr->rc_inp); + if (tcp_in_hpts(bbr->rc_tp)) { + tcp_hpts_remove(bbr->rc_tp); } } @@ -11437,7 +11437,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, } /* Set the flag */ bbr->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0; - tcp_set_hpts(inp); + tcp_set_hpts(tp); sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack); } if (thflags & TH_ACK) { @@ -11546,7 +11546,7 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && - (tcp_in_hpts(bbr->rc_inp)) && + (tcp_in_hpts(tp)) && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* * keep alive not needed if we are hptsi @@ -11554,8 +11554,8 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, */ ; } else { - if (tcp_in_hpts(bbr->rc_inp)) { - tcp_hpts_remove(bbr->rc_inp); + if (tcp_in_hpts(tp)) { + tcp_hpts_remove(tp); if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && (TSTMP_GT(lcts, bbr->rc_pacer_started))) { uint32_t del; @@ -11582,8 +11582,8 @@ bbr_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, bbr_timer_audit(tp, bbr, lcts, &so->so_snd); } /* Clear the flag, it may have been cleared by output but we may not have */ - if ((nxt_pkt == 0) && (inp->inp_hpts_calls)) - inp->inp_hpts_calls = 0; + if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS)) + tp->t_flags2 &= ~TF2_HPTS_CALLS; /* Do we have a new state */ if (bbr->r_state != tp->t_state) bbr_set_state(tp, bbr, tiwin); @@ -11842,7 +11842,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) int32_t slot = 0; struct inpcb *inp; struct sockbuf *sb; - uint32_t hpts_calling; + bool hpts_calling; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; @@ -11853,8 +11853,8 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) memcpy(&bbr->rc_tv, tv, sizeof(struct timeval)); cts = tcp_tv_to_usectick(&bbr->rc_tv); inp = bbr->rc_inp; - hpts_calling = inp->inp_hpts_calls; - inp->inp_hpts_calls = 0; + hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS); + tp->t_flags2 &= ~TF2_HPTS_CALLS; so = inp->inp_socket; sb = &so->so_snd; if (tp->t_nic_ktls_xmit) @@ -11884,7 +11884,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) } #endif if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && - tcp_in_hpts(inp)) { + tcp_in_hpts(tp)) { /* * We are on the hpts for some timer but not hptsi output. * Possibly remove from the hpts so we can send/recv etc. @@ -11913,7 +11913,7 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) return (0); } } - tcp_hpts_remove(inp); + tcp_hpts_remove(tp); bbr_timer_cancel(bbr, __LINE__, cts); } if (bbr->r_ctl.rc_last_delay_val) { @@ -11929,9 +11929,9 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) if ((bbr->r_timer_override) || (tp->t_state < TCPS_ESTABLISHED)) { /* Timeouts or early states are exempt */ - if (tcp_in_hpts(inp)) - tcp_hpts_remove(inp); - } else if (tcp_in_hpts(inp)) { + if (tcp_in_hpts(tp)) + tcp_hpts_remove(tp); + } else if (tcp_in_hpts(tp)) { if ((bbr->r_ctl.rc_last_delay_val) && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && delay_calc) { @@ -11943,10 +11943,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) */ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1); bbr->r_ctl.rc_last_delay_val = 0; - tcp_hpts_remove(inp); + tcp_hpts_remove(tp); } else if (tp->t_state == TCPS_CLOSED) { bbr->r_ctl.rc_last_delay_val = 0; - tcp_hpts_remove(inp); + tcp_hpts_remove(tp); } else { /* * On the hpts, you shall not pass! even if ACKNOW @@ -14088,7 +14088,7 @@ bbr_switch_failed(struct tcpcb *tp) inp->inp_flags2 |= INP_CANNOT_DO_ECN; inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); - if (inp->inp_in_hpts) { + if (tp->t_in_hpts > IHPTS_NONE) { return; } bbr = (struct tcp_bbr *)tp->t_fb_ptr; @@ -14109,7 +14109,7 @@ bbr_switch_failed(struct tcpcb *tp) } } else toval = HPTS_TICKS_PER_SLOT; - (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), __LINE__, &diag); bbr_log_hpts_diag(bbr, cts, &diag); } diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index e0130e7fea2..9e531a1d318 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -2579,7 +2579,7 @@ rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t log.u_bbr.flex5 = rsm->r_start; log.u_bbr.flex6 = rsm->r_end; log.u_bbr.flex8 = mod; - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; @@ -2605,7 +2605,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; - log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; + log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = rack->rc_in_persist; log.u_bbr.flex8 = which; @@ -2613,7 +2613,7 @@ rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot log.u_bbr.pkts_out = 0; else log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; @@ -2640,7 +2640,7 @@ rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rs struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; @@ -2678,7 +2678,7 @@ rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex8 = flag; - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.cur_del_rate = (uint64_t)prev; log.u_bbr.delRate = (uint64_t)rsm; log.u_bbr.rttProp = (uint64_t)next; @@ -2722,7 +2722,7 @@ rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t l union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = t; log.u_bbr.flex2 = len; log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; @@ -2894,7 +2894,7 @@ rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; @@ -2920,7 +2920,7 @@ rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_ union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = slot; if (rack->rack_no_prr) log.u_bbr.flex2 = 0; @@ -2968,7 +2968,7 @@ rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_ log.u_bbr.flex7 <<= 1; log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ log.u_bbr.flex8 = rack->rc_in_persist; - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; @@ -3021,7 +3021,7 @@ rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, ui struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = reason; @@ -3054,7 +3054,7 @@ rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32 union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; log.u_bbr.flex3 = flags_on_entry; @@ -4904,7 +4904,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, rack->r_ctl.rc_app_limited_cnt, 0, 0, 10, __LINE__, NULL, quality); } - if (tcp_in_hpts(rack->rc_inp) && + if (tcp_in_hpts(rack->rc_tp) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* * Ok we can't trust the pacer in this case @@ -4914,7 +4914,7 @@ rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, * Stop the pacer and clear up all the aggregate * delays etc. */ - tcp_hpts_remove(rack->rc_inp); + tcp_hpts_remove(rack->rc_tp); rack->r_ctl.rc_hpts_flags = 0; rack->r_ctl.rc_last_output_to = 0; } @@ -6506,8 +6506,8 @@ rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) struct timeval tv; uint32_t t_time; - if (tcp_in_hpts(rack->rc_inp)) { - tcp_hpts_remove(rack->rc_inp); + if (tcp_in_hpts(rack->rc_tp)) { + tcp_hpts_remove(rack->rc_tp); rack->r_ctl.rc_hpts_flags = 0; } #ifdef NETFLIX_SHARED_CWND @@ -6645,7 +6645,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, (tp->t_state == TCPS_LISTEN)) { return; } - if (tcp_in_hpts(inp)) { + if (tcp_in_hpts(tp)) { /* Already on the pacer */ return; } @@ -6896,12 +6896,12 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ - (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } else { - (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 1); @@ -6916,7 +6916,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, * at the start of this block) are good enough. */ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; - (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); @@ -8039,7 +8039,7 @@ rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8 rack->rc_inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; ret = -3; left = rack->r_ctl.rc_timer_exp - cts; - tcp_hpts_insert(tptoinpcb(tp), HPTS_MS_TO_SLOTS(left)); + tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); rack_log_to_processing(rack, cts, ret, left); return (1); } @@ -8080,7 +8080,7 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || ((tp->snd_max - tp->snd_una) == 0))) { - tcp_hpts_remove(rack->rc_inp); + tcp_hpts_remove(rack->rc_tp); hpts_removed = 1; /* If we were not delayed cancel out the flag. */ if ((tp->snd_max - tp->snd_una) == 0) @@ -8089,14 +8089,14 @@ rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int lin } if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; - if (tcp_in_hpts(rack->rc_inp) && + if (tcp_in_hpts(rack->rc_tp) && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { /* * Canceling timer's when we have no output being * paced. We also must remove ourselves from the * hpts. */ - tcp_hpts_remove(rack->rc_inp); + tcp_hpts_remove(rack->rc_tp); hpts_removed = 1; } rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); @@ -8124,8 +8124,8 @@ rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack) /* We enter in persists, set the flag appropriately */ rack->rc_in_persist = 1; } - if (tcp_in_hpts(rack->rc_inp)) { - tcp_hpts_remove(rack->rc_inp); + if (tcp_in_hpts(rack->rc_tp)) { + tcp_hpts_remove(rack->rc_tp); } } @@ -11394,7 +11394,7 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered (entered_recovery == 0)) { rack_update_prr(tp, rack, changed, th_ack); if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && - ((tcp_in_hpts(rack->rc_inp) == 0) && + ((tcp_in_hpts(rack->rc_tp) == 0) && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { /* * If you are pacing output you don't want @@ -14583,7 +14583,7 @@ rack_switch_failed(struct tcpcb *tp) inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; - if (inp->inp_in_hpts) { + if (tp->t_in_hpts > IHPTS_NONE) { /* Strange */ return; } @@ -14604,7 +14604,7 @@ rack_switch_failed(struct tcpcb *tp) } } else toval = HPTS_TICKS_PER_SLOT; - (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), __LINE__, &diag); rack_log_hpts_diag(rack, cts, &diag, &tv); } @@ -15201,7 +15201,7 @@ rack_init(struct tcpcb *tp, void **ptr) if (tov) { struct hpts_diag diag; - (void)tcp_hpts_insert_diag(rack->rc_inp, HPTS_USEC_TO_SLOTS(tov), + (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); } @@ -15487,7 +15487,7 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) * We will force the hpts to be stopped if any, and restart * with the slot set to what was in the saved slot. */ - if (tcp_in_hpts(rack->rc_inp)) { + if (tcp_in_hpts(rack->rc_tp)) { if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { uint32_t us_cts; @@ -15498,7 +15498,7 @@ rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) } rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; } - tcp_hpts_remove(rack->rc_inp); + tcp_hpts_remove(rack->rc_tp); } rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); @@ -15579,7 +15579,7 @@ rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent } #endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; else @@ -16438,8 +16438,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb } did_out = 1; } - if (rack->rc_inp->inp_hpts_calls) - rack->rc_inp->inp_hpts_calls = 0; + if (tp->t_flags2 & TF2_HPTS_CALLS) + tp->t_flags2 &= ~TF2_HPTS_CALLS; rack_free_trim(rack); #ifdef TCP_ACCOUNTING sched_unpin(); @@ -16673,7 +16673,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, } #endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; else @@ -16900,7 +16900,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, #endif return (1); } - tcp_set_hpts(inp); + tcp_set_hpts(tp); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } if (thflags & TH_FIN) @@ -16999,7 +16999,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, rack_free_trim(rack); } else if ((no_output == 1) && (nxt_pkt == 0) && - (tcp_in_hpts(rack->rc_inp) == 0)) { + (tcp_in_hpts(rack->rc_tp) == 0)) { /* * We are not in hpts and we had a pacing timer up. Use * the remaining time (slot_remaining) to restart the timer. @@ -17009,8 +17009,8 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, rack_free_trim(rack); } /* Clear the flag, it may have been cleared by output but we may not have */ - if ((nxt_pkt == 0) && (inp->inp_hpts_calls)) - inp->inp_hpts_calls = 0; + if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS)) + tp->t_flags2 &= ~TF2_HPTS_CALLS; /* Update any rounds needed */ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) rack_log_hystart_event(rack, high_seq, 8); @@ -17044,13 +17044,13 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, /* We could not send (probably in the hpts but stopped the timer earlier)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && - (tcp_in_hpts(rack->rc_inp)) && + (tcp_in_hpts(rack->rc_tp)) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* keep alive not needed if we are hptsi output yet */ ; } else { int late = 0; - if (tcp_in_hpts(inp)) { + if (tcp_in_hpts(tp)) { if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { us_cts = tcp_get_usecs(NULL); if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { @@ -17060,7 +17060,7 @@ rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, late = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; } - tcp_hpts_remove(inp); + tcp_hpts_remove(tp); } if (late && (did_out == 0)) { /* @@ -18074,7 +18074,7 @@ rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_ struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = error; log.u_bbr.flex2 = flags; log.u_bbr.flex3 = rsm_is_null; @@ -18339,7 +18339,7 @@ rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); #endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = p_rate; log.u_bbr.flex2 = p_queue; log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; @@ -18404,7 +18404,7 @@ rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp, out: if (tcp_bblogging_on(tp)) { memset(&log, 0, sizeof(log)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); log.u_bbr.flex1 = p_rate; log.u_bbr.flex2 = p_queue; log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; @@ -18769,7 +18769,7 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); } memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; else @@ -19302,7 +19302,7 @@ rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; else @@ -19634,7 +19634,7 @@ rack_output(struct tcpcb *tp) uint32_t cts, ms_cts, delayed, early; uint16_t add_flag = RACK_SENT_SP; /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ - uint8_t hpts_calling, doing_tlp = 0; + uint8_t doing_tlp = 0; uint32_t cwnd_to_use, pace_max_seg; int32_t do_a_prefetch = 0; int32_t prefetch_rsm = 0; @@ -19652,7 +19652,7 @@ rack_output(struct tcpcb *tp) struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif - bool hw_tls = false; + bool hpts_calling, hw_tls = false; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); @@ -19663,8 +19663,8 @@ rack_output(struct tcpcb *tp) sched_pin(); ts_val = get_cyclecount(); #endif - hpts_calling = inp->inp_hpts_calls; - rack->rc_inp->inp_hpts_calls = 0; + hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS); + tp->t_flags2 &= ~TF2_HPTS_CALLS; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { #ifdef TCP_ACCOUNTING @@ -19707,7 +19707,7 @@ rack_output(struct tcpcb *tp) cts = tcp_get_usecs(&tv); ms_cts = tcp_tv_to_mssectick(&tv); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && - tcp_in_hpts(rack->rc_inp)) { + tcp_in_hpts(rack->rc_tp)) { /* * We are on the hpts for some timer but not hptsi output. * Remove from the hpts unconditionally. @@ -19741,7 +19741,7 @@ rack_output(struct tcpcb *tp) } } if (rack->rc_in_persist) { - if (tcp_in_hpts(rack->rc_inp) == 0) { + if (tcp_in_hpts(rack->rc_tp) == 0) { /* Timer is not running */ rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); } @@ -19753,7 +19753,7 @@ rack_output(struct tcpcb *tp) if ((rack->rc_ack_required == 1) && (rack->r_timer_override == 0)){ /* A timeout occurred and no ack has arrived */ - if (tcp_in_hpts(rack->rc_inp) == 0) { + if (tcp_in_hpts(rack->rc_tp) == 0) { /* Timer is not running */ rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); } @@ -19767,9 +19767,9 @@ rack_output(struct tcpcb *tp) (delayed) || (tp->t_state < TCPS_ESTABLISHED)) { rack->rc_ack_can_sendout_data = 0; - if (tcp_in_hpts(rack->rc_inp)) - tcp_hpts_remove(rack->rc_inp); - } else if (tcp_in_hpts(rack->rc_inp)) { + if (tcp_in_hpts(rack->rc_tp)) + tcp_hpts_remove(rack->rc_tp); + } else if (tcp_in_hpts(rack->rc_tp)) { /* * On the hpts you can't pass even if ACKNOW is on, we will * when the hpts fires. @@ -21683,7 +21683,7 @@ rack_output(struct tcpcb *tp) union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); + log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); if (rack->rack_no_prr) log.u_bbr.flex1 = 0; else diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index c57eedef151..40dd9b7f3aa 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -2148,7 +2148,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = inp->inp_in_hpts; + log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); @@ -2315,11 +2315,7 @@ tcp_newtcpcb(struct inpcb *inp) */ inp->inp_ip_ttl = V_ip_defttl; #ifdef TCPHPTS - /* - * If using hpts lets drop a random number in so - * not all new connections fall on the same CPU. - */ - inp->inp_hpts_cpu = hpts_random_cpu(inp); + tcp_hpts_init(tp); #endif #ifdef TCPPCAP /* @@ -2434,6 +2430,7 @@ tcp_discardcb(struct tcpcb *tp) if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); + MPASS(!tcp_in_hpts(tp)); #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif @@ -2529,7 +2526,7 @@ tcp_close(struct tcpcb *tp) tp->t_tfo_pending = NULL; } #ifdef TCPHPTS - tcp_hpts_remove(inp); + tcp_hpts_remove(tp); #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index 124a254cae3..0d144cb04e5 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -80,7 +80,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index a613e5fbf2b..d23dd9f9722 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1712,7 +1712,7 @@ tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt) */ #ifdef TCPHPTS /* Assure that we are not on any hpts */ - tcp_hpts_remove(tptoinpcb(tp)); + tcp_hpts_remove(tp); #endif if (blk->tfb_tcp_fb_init) { error = (*blk->tfb_tcp_fb_init)(tp, &ptr); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index f80e9fc37ff..a3016a143b9 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -314,6 +314,23 @@ struct tcpcb { sbintime_t t_timers[TT_N]; sbintime_t t_precisions[TT_N]; + /* HPTS. Used by BBR and Rack stacks. See tcp_hpts.c for more info. */ + TAILQ_ENTRY(tcpcb) t_hpts; /* linkage to HPTS ring */ + STAILQ_HEAD(, mbuf) t_inqueue; /* HPTS input packets queue */ + uint32_t t_hpts_request; /* Current hpts request, zero if + * fits in the pacing window. */ + uint32_t t_hpts_slot; /* HPTS wheel slot this tcb is. */ + uint32_t t_hpts_drop_reas; /* Reason we are dropping the pcb. */ + uint32_t t_hpts_gencnt; + uint16_t t_hpts_cpu; /* CPU chosen by hpts_cpuid(). */ + uint16_t t_lro_cpu; /* CPU derived from LRO. */ +#define HPTS_CPU_NONE ((uint16_t)-1) + enum { + IHPTS_NONE = 0, + IHPTS_ONQUEUE, + IHPTS_MOVING, + } t_in_hpts; /* Is it linked into HPTS? */ + uint32_t t_maxseg:24, /* maximum segment size */ _t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ @@ -355,7 +372,6 @@ struct tcpcb { int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* total reassembly queue byte length */ struct tsegqe_head t_segq; /* segment reassembly queue */ - STAILQ_HEAD(, mbuf) t_inqueue; /* HPTS input queue */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch @@ -832,9 +848,11 @@ tcp_packets_this_ack(struct tcpcb *tp, tcp_seq ack) #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ +#define TF2_HPTS_CPU_SET 0x00000200 /* t_hpts_cpu is not random */ #define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */ #define TF2_ECN_USE_ECT1 0x00000800 /* Use ECT(1) marking on session */ #define TF2_TCP_ACCOUNTING 0x00001000 /* Do TCP accounting */ +#define TF2_HPTS_CALLS 0x00002000 /* tcp_output() called via HPTS */ /* * Structure to hold TCP options that are only used during segment