1
0
mirror of https://git.FreeBSD.org/src.git synced 2025-01-08 13:28:05 +00:00

Reduce the cost of TLB invalidation on x86 by using per-CPU completion flags

Reduce contention during TLB invalidation operations by using a per-CPU
completion flag, rather than a single atomically-updated variable.

On a Westmere system (2 sockets x 4 cores x 1 thread), dtrace measurements
show that smp_tlb_shootdown is about 50% faster with this patch; observations
with VTune show that the percentage of time spent in invlrng_single_page on an
interrupt (actually doing invalidation, rather than synchronization) increases
from 31% with the old mechanism to 71% with the new one.  (Running a basic file
server workload.)

Submitted by:	Anton Rang <rang at acm.org>
Reviewed by:	cem (earlier version), kib
Sponsored by:	Dell EMC Isilon
Differential Revision:	https://reviews.freebsd.org/D8041
This commit is contained in:
Conrad Meyer 2016-09-30 18:12:16 +00:00
parent 106961b640
commit 2965d505f6
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=306516
4 changed files with 45 additions and 21 deletions

View File

@@ -409,6 +409,7 @@ void
invltlb_invpcid_handler(void) invltlb_invpcid_handler(void)
{ {
struct invpcid_descr d; struct invpcid_descr d;
uint64_t generation;
#ifdef COUNT_XINVLTLB_HITS #ifdef COUNT_XINVLTLB_HITS
xhits_gbl[PCPU_GET(cpuid)]++; xhits_gbl[PCPU_GET(cpuid)]++;
@@ -417,17 +418,20 @@ invltlb_invpcid_handler(void)
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++; (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */ #endif /* COUNT_IPIS */
generation = smp_tlb_generation;
d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid; d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
d.pad = 0; d.pad = 0;
d.addr = 0; d.addr = 0;
invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB : invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
INVPCID_CTX); INVPCID_CTX);
atomic_add_int(&smp_tlb_wait, 1); PCPU_SET(smp_tlb_done, generation);
} }
void void
invltlb_pcid_handler(void) invltlb_pcid_handler(void)
{ {
uint64_t generation;
#ifdef COUNT_XINVLTLB_HITS #ifdef COUNT_XINVLTLB_HITS
xhits_gbl[PCPU_GET(cpuid)]++; xhits_gbl[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */ #endif /* COUNT_XINVLTLB_HITS */
@@ -435,6 +439,7 @@ invltlb_pcid_handler(void)
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++; (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */ #endif /* COUNT_IPIS */
generation = smp_tlb_generation;
if (smp_tlb_pmap == kernel_pmap) { if (smp_tlb_pmap == kernel_pmap) {
invltlb_glob(); invltlb_glob();
} else { } else {
@@ -450,5 +455,5 @@ invltlb_pcid_handler(void)
smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid); smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
} }
} }
atomic_add_int(&smp_tlb_wait, 1); PCPU_SET(smp_tlb_done, generation);
} }

View File

@@ -65,7 +65,8 @@
u_int pc_vcpu_id; /* Xen vCPU ID */ \ u_int pc_vcpu_id; /* Xen vCPU ID */ \
uint32_t pc_pcid_next; \ uint32_t pc_pcid_next; \
uint32_t pc_pcid_gen; \ uint32_t pc_pcid_gen; \
char __pad[149] /* be divisor of PAGE_SIZE \ uint64_t pc_smp_tlb_done; /* TLB op acknowledgement */ \
char __pad[141] /* be divisor of PAGE_SIZE \
after cache alignment */ after cache alignment */
#define PC_DBREG_CMD_NONE 0 #define PC_DBREG_CMD_NONE 0

View File

@@ -35,7 +35,7 @@ extern volatile int aps_ready;
extern struct mtx ap_boot_mtx; extern struct mtx ap_boot_mtx;
extern int cpu_logical; extern int cpu_logical;
extern int cpu_cores; extern int cpu_cores;
extern volatile int smp_tlb_wait; extern volatile uint64_t smp_tlb_generation;
extern struct pmap *smp_tlb_pmap; extern struct pmap *smp_tlb_pmap;
extern u_int xhits_gbl[]; extern u_int xhits_gbl[];
extern u_int xhits_pg[]; extern u_int xhits_pg[];

View File

@@ -1304,12 +1304,15 @@ cpususpend_handler(void)
void void
invlcache_handler(void) invlcache_handler(void)
{ {
uint64_t generation;
#ifdef COUNT_IPIS #ifdef COUNT_IPIS
(*ipi_invlcache_counts[PCPU_GET(cpuid)])++; (*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */ #endif /* COUNT_IPIS */
generation = smp_tlb_generation;
wbinvd(); wbinvd();
atomic_add_int(&smp_tlb_wait, 1); PCPU_SET(smp_tlb_done, generation);
} }
/* /*
@@ -1367,7 +1370,7 @@ SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
/* Variables needed for SMP tlb shootdown. */ /* Variables needed for SMP tlb shootdown. */
static vm_offset_t smp_tlb_addr1, smp_tlb_addr2; static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
pmap_t smp_tlb_pmap; pmap_t smp_tlb_pmap;
volatile int smp_tlb_wait; volatile uint64_t smp_tlb_generation;
#ifdef __amd64__ #ifdef __amd64__
#define read_eflags() read_rflags() #define read_eflags() read_rflags()
@@ -1377,15 +1380,16 @@ static void
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
vm_offset_t addr1, vm_offset_t addr2) vm_offset_t addr1, vm_offset_t addr2)
{ {
int cpu, ncpu, othercpus; cpuset_t other_cpus;
volatile uint64_t *p_cpudone;
othercpus = mp_ncpus - 1; /* does not shootdown self */ uint64_t generation;
int cpu;
/* /*
* Check for other cpus. Return if none. * Check for other cpus. Return if none.
*/ */
if (CPU_ISFULLSET(&mask)) { if (CPU_ISFULLSET(&mask)) {
if (othercpus < 1) if (mp_ncpus <= 1)
return; return;
} else { } else {
CPU_CLR(PCPU_GET(cpuid), &mask); CPU_CLR(PCPU_GET(cpuid), &mask);
@@ -1399,23 +1403,28 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
smp_tlb_addr1 = addr1; smp_tlb_addr1 = addr1;
smp_tlb_addr2 = addr2; smp_tlb_addr2 = addr2;
smp_tlb_pmap = pmap; smp_tlb_pmap = pmap;
smp_tlb_wait = 0; generation = ++smp_tlb_generation;
if (CPU_ISFULLSET(&mask)) { if (CPU_ISFULLSET(&mask)) {
ncpu = othercpus;
ipi_all_but_self(vector); ipi_all_but_self(vector);
other_cpus = all_cpus;
CPU_CLR(PCPU_GET(cpuid), &other_cpus);
} else { } else {
ncpu = 0; other_cpus = mask;
while ((cpu = CPU_FFS(&mask)) != 0) { while ((cpu = CPU_FFS(&mask)) != 0) {
cpu--; cpu--;
CPU_CLR(cpu, &mask); CPU_CLR(cpu, &mask);
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
cpu, vector); cpu, vector);
ipi_send_cpu(cpu, vector); ipi_send_cpu(cpu, vector);
ncpu++;
} }
} }
while (smp_tlb_wait < ncpu) while ((cpu = CPU_FFS(&other_cpus)) != 0) {
ia32_pause(); cpu--;
CPU_CLR(cpu, &other_cpus);
p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
while (*p_cpudone != generation)
ia32_pause();
}
mtx_unlock_spin(&smp_ipi_mtx); mtx_unlock_spin(&smp_ipi_mtx);
} }
@@ -1473,6 +1482,8 @@ smp_cache_flush(void)
void void
invltlb_handler(void) invltlb_handler(void)
{ {
uint64_t generation;
#ifdef COUNT_XINVLTLB_HITS #ifdef COUNT_XINVLTLB_HITS
xhits_gbl[PCPU_GET(cpuid)]++; xhits_gbl[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */ #endif /* COUNT_XINVLTLB_HITS */
@@ -1480,16 +1491,19 @@ invltlb_handler(void)
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++; (*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */ #endif /* COUNT_IPIS */
generation = smp_tlb_generation;
if (smp_tlb_pmap == kernel_pmap) if (smp_tlb_pmap == kernel_pmap)
invltlb_glob(); invltlb_glob();
else else
invltlb(); invltlb();
atomic_add_int(&smp_tlb_wait, 1); PCPU_SET(smp_tlb_done, generation);
} }
void void
invlpg_handler(void) invlpg_handler(void)
{ {
uint64_t generation;
#ifdef COUNT_XINVLTLB_HITS #ifdef COUNT_XINVLTLB_HITS
xhits_pg[PCPU_GET(cpuid)]++; xhits_pg[PCPU_GET(cpuid)]++;
#endif /* COUNT_XINVLTLB_HITS */ #endif /* COUNT_XINVLTLB_HITS */
@@ -1497,14 +1511,16 @@ invlpg_handler(void)
(*ipi_invlpg_counts[PCPU_GET(cpuid)])++; (*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
#endif /* COUNT_IPIS */ #endif /* COUNT_IPIS */
generation = smp_tlb_generation;
invlpg(smp_tlb_addr1); invlpg(smp_tlb_addr1);
atomic_add_int(&smp_tlb_wait, 1); PCPU_SET(smp_tlb_done, generation);
} }
void void
invlrng_handler(void) invlrng_handler(void)
{ {
vm_offset_t addr; vm_offset_t addr, addr2;
uint64_t generation;
#ifdef COUNT_XINVLTLB_HITS #ifdef COUNT_XINVLTLB_HITS
xhits_rng[PCPU_GET(cpuid)]++; xhits_rng[PCPU_GET(cpuid)]++;
@@ -1514,10 +1530,12 @@ invlrng_handler(void)
#endif /* COUNT_IPIS */ #endif /* COUNT_IPIS */
addr = smp_tlb_addr1; addr = smp_tlb_addr1;
addr2 = smp_tlb_addr2;
generation = smp_tlb_generation;
do { do {
invlpg(addr); invlpg(addr);
addr += PAGE_SIZE; addr += PAGE_SIZE;
} while (addr < smp_tlb_addr2); } while (addr < addr2);
atomic_add_int(&smp_tlb_wait, 1); PCPU_SET(smp_tlb_done, generation);
} }