mirror of
https://git.FreeBSD.org/src.git
synced 2025-01-08 13:28:05 +00:00
Reduce the cost of TLB invalidation on x86 by using per-CPU completion flags
Reduce contention during TLB invalidation operations by using a per-CPU completion flag, rather than a single atomically-updated variable.

On a Westmere system (2 sockets x 4 cores x 1 thread), dtrace measurements show that smp_tlb_shootdown is about 50% faster with this patch; observations with VTune show that the percentage of time spent in invlrng_single_page on an interrupt (actually doing invalidation, rather than synchronization) increases from 31% with the old mechanism to 71% with the new one. (Both measurements were taken while running a basic file server workload.)

Submitted by: Anton Rang <rang at acm.org>
Reviewed by: cem (earlier version), kib
Sponsored by: Dell EMC Isilon
Differential Revision: https://reviews.freebsd.org/D8041
This commit is contained in:
parent
106961b640
commit
2965d505f6
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=306516
@ -409,6 +409,7 @@ void
|
|||||||
invltlb_invpcid_handler(void)
|
invltlb_invpcid_handler(void)
|
||||||
{
|
{
|
||||||
struct invpcid_descr d;
|
struct invpcid_descr d;
|
||||||
|
uint64_t generation;
|
||||||
|
|
||||||
#ifdef COUNT_XINVLTLB_HITS
|
#ifdef COUNT_XINVLTLB_HITS
|
||||||
xhits_gbl[PCPU_GET(cpuid)]++;
|
xhits_gbl[PCPU_GET(cpuid)]++;
|
||||||
@ -417,17 +418,20 @@ invltlb_invpcid_handler(void)
|
|||||||
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
|
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
|
||||||
#endif /* COUNT_IPIS */
|
#endif /* COUNT_IPIS */
|
||||||
|
|
||||||
|
generation = smp_tlb_generation;
|
||||||
d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
|
d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
|
||||||
d.pad = 0;
|
d.pad = 0;
|
||||||
d.addr = 0;
|
d.addr = 0;
|
||||||
invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
|
invpcid(&d, smp_tlb_pmap == kernel_pmap ? INVPCID_CTXGLOB :
|
||||||
INVPCID_CTX);
|
INVPCID_CTX);
|
||||||
atomic_add_int(&smp_tlb_wait, 1);
|
PCPU_SET(smp_tlb_done, generation);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
invltlb_pcid_handler(void)
|
invltlb_pcid_handler(void)
|
||||||
{
|
{
|
||||||
|
uint64_t generation;
|
||||||
|
|
||||||
#ifdef COUNT_XINVLTLB_HITS
|
#ifdef COUNT_XINVLTLB_HITS
|
||||||
xhits_gbl[PCPU_GET(cpuid)]++;
|
xhits_gbl[PCPU_GET(cpuid)]++;
|
||||||
#endif /* COUNT_XINVLTLB_HITS */
|
#endif /* COUNT_XINVLTLB_HITS */
|
||||||
@ -435,6 +439,7 @@ invltlb_pcid_handler(void)
|
|||||||
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
|
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
|
||||||
#endif /* COUNT_IPIS */
|
#endif /* COUNT_IPIS */
|
||||||
|
|
||||||
|
generation = smp_tlb_generation;
|
||||||
if (smp_tlb_pmap == kernel_pmap) {
|
if (smp_tlb_pmap == kernel_pmap) {
|
||||||
invltlb_glob();
|
invltlb_glob();
|
||||||
} else {
|
} else {
|
||||||
@ -450,5 +455,5 @@ invltlb_pcid_handler(void)
|
|||||||
smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
|
smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
atomic_add_int(&smp_tlb_wait, 1);
|
PCPU_SET(smp_tlb_done, generation);
|
||||||
}
|
}
|
||||||
|
@ -65,7 +65,8 @@
|
|||||||
u_int pc_vcpu_id; /* Xen vCPU ID */ \
|
u_int pc_vcpu_id; /* Xen vCPU ID */ \
|
||||||
uint32_t pc_pcid_next; \
|
uint32_t pc_pcid_next; \
|
||||||
uint32_t pc_pcid_gen; \
|
uint32_t pc_pcid_gen; \
|
||||||
char __pad[149] /* be divisor of PAGE_SIZE \
|
uint64_t pc_smp_tlb_done; /* TLB op acknowledgement */ \
|
||||||
|
char __pad[141] /* be divisor of PAGE_SIZE \
|
||||||
after cache alignment */
|
after cache alignment */
|
||||||
|
|
||||||
#define PC_DBREG_CMD_NONE 0
|
#define PC_DBREG_CMD_NONE 0
|
||||||
|
@ -35,7 +35,7 @@ extern volatile int aps_ready;
|
|||||||
extern struct mtx ap_boot_mtx;
|
extern struct mtx ap_boot_mtx;
|
||||||
extern int cpu_logical;
|
extern int cpu_logical;
|
||||||
extern int cpu_cores;
|
extern int cpu_cores;
|
||||||
extern volatile int smp_tlb_wait;
|
extern volatile uint64_t smp_tlb_generation;
|
||||||
extern struct pmap *smp_tlb_pmap;
|
extern struct pmap *smp_tlb_pmap;
|
||||||
extern u_int xhits_gbl[];
|
extern u_int xhits_gbl[];
|
||||||
extern u_int xhits_pg[];
|
extern u_int xhits_pg[];
|
||||||
|
@ -1304,12 +1304,15 @@ cpususpend_handler(void)
|
|||||||
void
|
void
|
||||||
invlcache_handler(void)
|
invlcache_handler(void)
|
||||||
{
|
{
|
||||||
|
uint64_t generation;
|
||||||
|
|
||||||
#ifdef COUNT_IPIS
|
#ifdef COUNT_IPIS
|
||||||
(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
|
(*ipi_invlcache_counts[PCPU_GET(cpuid)])++;
|
||||||
#endif /* COUNT_IPIS */
|
#endif /* COUNT_IPIS */
|
||||||
|
|
||||||
|
generation = smp_tlb_generation;
|
||||||
wbinvd();
|
wbinvd();
|
||||||
atomic_add_int(&smp_tlb_wait, 1);
|
PCPU_SET(smp_tlb_done, generation);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1367,7 +1370,7 @@ SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL);
|
|||||||
/* Variables needed for SMP tlb shootdown. */
|
/* Variables needed for SMP tlb shootdown. */
|
||||||
static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
|
static vm_offset_t smp_tlb_addr1, smp_tlb_addr2;
|
||||||
pmap_t smp_tlb_pmap;
|
pmap_t smp_tlb_pmap;
|
||||||
volatile int smp_tlb_wait;
|
volatile uint64_t smp_tlb_generation;
|
||||||
|
|
||||||
#ifdef __amd64__
|
#ifdef __amd64__
|
||||||
#define read_eflags() read_rflags()
|
#define read_eflags() read_rflags()
|
||||||
@ -1377,15 +1380,16 @@ static void
|
|||||||
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
|
smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
|
||||||
vm_offset_t addr1, vm_offset_t addr2)
|
vm_offset_t addr1, vm_offset_t addr2)
|
||||||
{
|
{
|
||||||
int cpu, ncpu, othercpus;
|
cpuset_t other_cpus;
|
||||||
|
volatile uint64_t *p_cpudone;
|
||||||
othercpus = mp_ncpus - 1; /* does not shootdown self */
|
uint64_t generation;
|
||||||
|
int cpu;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check for other cpus. Return if none.
|
* Check for other cpus. Return if none.
|
||||||
*/
|
*/
|
||||||
if (CPU_ISFULLSET(&mask)) {
|
if (CPU_ISFULLSET(&mask)) {
|
||||||
if (othercpus < 1)
|
if (mp_ncpus <= 1)
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
CPU_CLR(PCPU_GET(cpuid), &mask);
|
CPU_CLR(PCPU_GET(cpuid), &mask);
|
||||||
@ -1399,23 +1403,28 @@ smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap,
|
|||||||
smp_tlb_addr1 = addr1;
|
smp_tlb_addr1 = addr1;
|
||||||
smp_tlb_addr2 = addr2;
|
smp_tlb_addr2 = addr2;
|
||||||
smp_tlb_pmap = pmap;
|
smp_tlb_pmap = pmap;
|
||||||
smp_tlb_wait = 0;
|
generation = ++smp_tlb_generation;
|
||||||
if (CPU_ISFULLSET(&mask)) {
|
if (CPU_ISFULLSET(&mask)) {
|
||||||
ncpu = othercpus;
|
|
||||||
ipi_all_but_self(vector);
|
ipi_all_but_self(vector);
|
||||||
|
other_cpus = all_cpus;
|
||||||
|
CPU_CLR(PCPU_GET(cpuid), &other_cpus);
|
||||||
} else {
|
} else {
|
||||||
ncpu = 0;
|
other_cpus = mask;
|
||||||
while ((cpu = CPU_FFS(&mask)) != 0) {
|
while ((cpu = CPU_FFS(&mask)) != 0) {
|
||||||
cpu--;
|
cpu--;
|
||||||
CPU_CLR(cpu, &mask);
|
CPU_CLR(cpu, &mask);
|
||||||
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
|
CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__,
|
||||||
cpu, vector);
|
cpu, vector);
|
||||||
ipi_send_cpu(cpu, vector);
|
ipi_send_cpu(cpu, vector);
|
||||||
ncpu++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (smp_tlb_wait < ncpu)
|
while ((cpu = CPU_FFS(&other_cpus)) != 0) {
|
||||||
ia32_pause();
|
cpu--;
|
||||||
|
CPU_CLR(cpu, &other_cpus);
|
||||||
|
p_cpudone = &cpuid_to_pcpu[cpu]->pc_smp_tlb_done;
|
||||||
|
while (*p_cpudone != generation)
|
||||||
|
ia32_pause();
|
||||||
|
}
|
||||||
mtx_unlock_spin(&smp_ipi_mtx);
|
mtx_unlock_spin(&smp_ipi_mtx);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1473,6 +1482,8 @@ smp_cache_flush(void)
|
|||||||
void
|
void
|
||||||
invltlb_handler(void)
|
invltlb_handler(void)
|
||||||
{
|
{
|
||||||
|
uint64_t generation;
|
||||||
|
|
||||||
#ifdef COUNT_XINVLTLB_HITS
|
#ifdef COUNT_XINVLTLB_HITS
|
||||||
xhits_gbl[PCPU_GET(cpuid)]++;
|
xhits_gbl[PCPU_GET(cpuid)]++;
|
||||||
#endif /* COUNT_XINVLTLB_HITS */
|
#endif /* COUNT_XINVLTLB_HITS */
|
||||||
@ -1480,16 +1491,19 @@ invltlb_handler(void)
|
|||||||
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
|
(*ipi_invltlb_counts[PCPU_GET(cpuid)])++;
|
||||||
#endif /* COUNT_IPIS */
|
#endif /* COUNT_IPIS */
|
||||||
|
|
||||||
|
generation = smp_tlb_generation;
|
||||||
if (smp_tlb_pmap == kernel_pmap)
|
if (smp_tlb_pmap == kernel_pmap)
|
||||||
invltlb_glob();
|
invltlb_glob();
|
||||||
else
|
else
|
||||||
invltlb();
|
invltlb();
|
||||||
atomic_add_int(&smp_tlb_wait, 1);
|
PCPU_SET(smp_tlb_done, generation);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
invlpg_handler(void)
|
invlpg_handler(void)
|
||||||
{
|
{
|
||||||
|
uint64_t generation;
|
||||||
|
|
||||||
#ifdef COUNT_XINVLTLB_HITS
|
#ifdef COUNT_XINVLTLB_HITS
|
||||||
xhits_pg[PCPU_GET(cpuid)]++;
|
xhits_pg[PCPU_GET(cpuid)]++;
|
||||||
#endif /* COUNT_XINVLTLB_HITS */
|
#endif /* COUNT_XINVLTLB_HITS */
|
||||||
@ -1497,14 +1511,16 @@ invlpg_handler(void)
|
|||||||
(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
|
(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
|
||||||
#endif /* COUNT_IPIS */
|
#endif /* COUNT_IPIS */
|
||||||
|
|
||||||
|
generation = smp_tlb_generation;
|
||||||
invlpg(smp_tlb_addr1);
|
invlpg(smp_tlb_addr1);
|
||||||
atomic_add_int(&smp_tlb_wait, 1);
|
PCPU_SET(smp_tlb_done, generation);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
invlrng_handler(void)
|
invlrng_handler(void)
|
||||||
{
|
{
|
||||||
vm_offset_t addr;
|
vm_offset_t addr, addr2;
|
||||||
|
uint64_t generation;
|
||||||
|
|
||||||
#ifdef COUNT_XINVLTLB_HITS
|
#ifdef COUNT_XINVLTLB_HITS
|
||||||
xhits_rng[PCPU_GET(cpuid)]++;
|
xhits_rng[PCPU_GET(cpuid)]++;
|
||||||
@ -1514,10 +1530,12 @@ invlrng_handler(void)
|
|||||||
#endif /* COUNT_IPIS */
|
#endif /* COUNT_IPIS */
|
||||||
|
|
||||||
addr = smp_tlb_addr1;
|
addr = smp_tlb_addr1;
|
||||||
|
addr2 = smp_tlb_addr2;
|
||||||
|
generation = smp_tlb_generation;
|
||||||
do {
|
do {
|
||||||
invlpg(addr);
|
invlpg(addr);
|
||||||
addr += PAGE_SIZE;
|
addr += PAGE_SIZE;
|
||||||
} while (addr < smp_tlb_addr2);
|
} while (addr < addr2);
|
||||||
|
|
||||||
atomic_add_int(&smp_tlb_wait, 1);
|
PCPU_SET(smp_tlb_done, generation);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user