Kernel support for the Vector-Scalar eXtension (VSX) found on the POWER7 and POWER8.

This instruction set unifies the 32 64-bit scalar floating point registers with the 32 128-bit vector registers into a single bank of 64 128-bit registers. Kernel support mostly amounts to saving and restoring the wider version of the floating point registers and making sure that both scalar FP and vector registers are enabled once a VSX instruction is executed. get_mcontext() and friends currently cannot see the high bits; exposing them will require a little more work. As the system compiler (GCC 4.2) does not support VSX, making use of this from userland requires either a newer GCC or clang.

Relnotes: yes
Sponsored by: FreeBSD Foundation
svn path=/head/; revision=279189
2024-12-18 10:35:55 +00:00 · 2015-02-22 21:40:27 +00:00 · 2015-02-22 21:40:27 +00:00 · 35f612b88a · 2020-12-20 02:59:44 +00:00
commit 35f612b88a
parent 4445af212c
13 changed files with 138 additions and 61 deletions
--- a/sys/powerpc/aim/trap.c
+++ b/sys/powerpc/aim/trap.c
@ -116,6 +116,7 @@ static struct powerpc_exception powerpc_exceptions[] = {
 	{ 0x0e00, "floating-point assist" },
 	{ 0x0f00, "performance monitoring" },
 	{ 0x0f20, "altivec unavailable" },
+	{ 0x0f40, "vsx unavailable" },
 	{ 0x1000, "instruction tlb miss" },
 	{ 0x1100, "data load tlb miss" },
 	{ 0x1200, "data store tlb miss" },
@ -230,6 +231,17 @@ trap(struct trapframe *frame)
 			enable_vec(td);
 			break;

+		case EXC_VSX:
+			KASSERT((td->td_pcb->pcb_flags & PCB_VSX) != PCB_VSX,
+			    ("VSX already enabled for thread"));
+			if (!(td->td_pcb->pcb_flags & PCB_VEC))
+				enable_vec(td);
+			if (td->td_pcb->pcb_flags & PCB_FPU)
+				save_fpu(td);	/* spill live FPRs to PCB */
+			td->td_pcb->pcb_flags |= PCB_VSX;
+			enable_fpu(td);
+			break;
+
 		case EXC_VECAST_G4:
 		case EXC_VECAST_G5:
 			/*
@ -709,7 +721,7 @@ fix_unaligned(struct thread *td, struct trapframe *frame)
 	case EXC_ALI_LFD:
 	case EXC_ALI_STFD:
 		reg = EXC_ALI_RST(frame->cpu.aim.dsisr);
-		fpr = &td->td_pcb->pcb_fpu.fpr[reg];
+		fpr = &td->td_pcb->pcb_fpu.fpr[reg].fpr;
 		fputhread = PCPU_GET(fputhread);

 		/* Juggle the FPU to ensure that we've initialized
--- a/sys/powerpc/aim/trap_subr64.S
+++ b/sys/powerpc/aim/trap_subr64.S
@ -359,7 +359,7 @@ CNAME(trapcode):
 	li	%r1,TRAP_GENTRAP
 	ld	%r1,0(%r1)
 	mtlr	%r1
-	li	%r1, 0xA0		/* How to get the vector from LR */
+	li	%r1, 0xe0		/* How to get the vector from LR */
 	blrl				/* Branch to generictrap */
 CNAME(trapcodeend):

--- a/sys/powerpc/fpu/fpu_emu.c
+++ b/sys/powerpc/fpu/fpu_emu.c
@ -335,7 +335,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 				if (ra != 0)
 					addr += tf->fixreg[ra];
 				rt = instr.i_x.i_rt;
-				a = (int *)&fs->fpreg[rt];
+				a = (int *)&fs->fpreg[rt].fpr;
 				DPRINTF(FPE_INSN,
 					("fpu_execute: Store INT %x at %p\n",
 						a[1], (void *)addr));
@ -402,7 +402,8 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 				DPRINTF(FPE_INSN, 
 					("fpu_execute: Store DBL at %p\n",
 						(void *)addr));
-				if (copyout(&fs->fpreg[rt], (void *)addr, size))
+				if (copyout(&fs->fpreg[rt].fpr, (void *)addr,
+				    size))
 					return (FAULT);
 			}
 		} else {
@ -410,12 +411,13 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 			FPU_EMU_EVCNT_INCR(fpload);
 			DPRINTF(FPE_INSN, ("fpu_execute: Load from %p\n",
 				(void *)addr));
-			if (copyin((const void *)addr, &fs->fpreg[rt], size))
+			if (copyin((const void *)addr, &fs->fpreg[rt].fpr,
+			    size))
 				return (FAULT);
 			if (type != FTYPE_DBL) {
 				fpu_explode(fe, fp = &fe->fe_f1, type, rt);
 				fpu_implode(fe, fp, FTYPE_DBL, 
-					(u_int *)&fs->fpreg[rt]);
+					(u_int *)&fs->fpreg[rt].fpr);
 			}
 		}
 		if (update) 
@ -468,7 +470,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 				DPRINTF(FPE_INSN, ("fpu_execute: FRSP\n"));
 				fpu_explode(fe, fp = &fe->fe_f1, FTYPE_DBL, rb);
 				fpu_implode(fe, fp, FTYPE_SNG, 
-					(u_int *)&fs->fpreg[rt]);
+					(u_int *)&fs->fpreg[rt].fpr);
 				fpu_explode(fe, fp = &fe->fe_f1, FTYPE_SNG, rt);
 				type = FTYPE_DBL;
 				break;
@ -501,9 +503,9 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 			case	OPC63_FNEG:
 				FPU_EMU_EVCNT_INCR(fnegabs);
 				DPRINTF(FPE_INSN, ("fpu_execute: FNEGABS\n"));
-				memcpy(&fs->fpreg[rt], &fs->fpreg[rb],
+				memcpy(&fs->fpreg[rt].fpr, &fs->fpreg[rb].fpr,
 					sizeof(double));
-				a = (int *)&fs->fpreg[rt];
+				a = (int *)&fs->fpreg[rt].fpr;
 				*a ^= (1U << 31);
 				break;
 			case	OPC63_MCRFS:
@ -531,7 +533,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 			case	OPC63_FMR:
 				FPU_EMU_EVCNT_INCR(fmr);
 				DPRINTF(FPE_INSN, ("fpu_execute: FMR\n"));
-				memcpy(&fs->fpreg[rt], &fs->fpreg[rb],
+				memcpy(&fs->fpreg[rt].fpr, &fs->fpreg[rb].fpr,
 					sizeof(double));
 				break;
 			case	OPC63_MTFSFI:
@ -548,23 +550,23 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 			case	OPC63_FNABS:
 				FPU_EMU_EVCNT_INCR(fnabs);
 				DPRINTF(FPE_INSN, ("fpu_execute: FABS\n"));
-				memcpy(&fs->fpreg[rt], &fs->fpreg[rb],
+				memcpy(&fs->fpreg[rt].fpr, &fs->fpreg[rb].fpr,
 					sizeof(double));
-				a = (int *)&fs->fpreg[rt];
+				a = (int *)&fs->fpreg[rt].fpr;
 				*a |= (1U << 31);
 				break;
 			case	OPC63_FABS:
 				FPU_EMU_EVCNT_INCR(fabs);
 				DPRINTF(FPE_INSN, ("fpu_execute: FABS\n"));
-				memcpy(&fs->fpreg[rt], &fs->fpreg[rb],
+				memcpy(&fs->fpreg[rt].fpr, &fs->fpreg[rb].fpr,
 					sizeof(double));
-				a = (int *)&fs->fpreg[rt];
+				a = (int *)&fs->fpreg[rt].fpr;
 				*a &= ~(1U << 31);
 				break;
 			case	OPC63_MFFS:
 				FPU_EMU_EVCNT_INCR(mffs);
 				DPRINTF(FPE_INSN, ("fpu_execute: MFFS\n"));
-				memcpy(&fs->fpreg[rt], &fs->fpscr,
+				memcpy(&fs->fpreg[rt].fpr, &fs->fpscr,
 					sizeof(fs->fpscr));
 				break;
 			case	OPC63_MTFSF:
@ -579,7 +581,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 						if (rt & (1<<ra))
 							mask |= (0xf<<(4*ra));
 				}
-				a = (int *)&fs->fpreg[rt];
+				a = (int *)&fs->fpreg[rt].fpr;
 				fe->fe_cx = mask & a[1];
 				fe->fe_fpscr = (fe->fe_fpscr&~mask) | 
 					(fe->fe_cx);
@ -646,12 +648,12 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 			case	OPC63M_FSEL:
 				FPU_EMU_EVCNT_INCR(fsel);
 				DPRINTF(FPE_INSN, ("fpu_execute: FSEL\n"));
-				a = (int *)&fe->fe_fpstate->fpreg[ra];
+				a = (int *)&fe->fe_fpstate->fpreg[ra].fpr;
 				if ((*a & 0x80000000) && (*a & 0x7fffffff)) 
 					/* fra < 0 */
 					rc = rb;
 				DPRINTF(FPE_INSN, ("f%d => f%d\n", rc, rt));
-				memcpy(&fs->fpreg[rt], &fs->fpreg[rc],
+				memcpy(&fs->fpreg[rt].fpr, &fs->fpreg[rc].fpr,
 					sizeof(double));
 				break;
 			case	OPC59_FRES:
@ -660,7 +662,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 				fpu_explode(fe, &fe->fe_f1, type, rb);
 				fp = fpu_sqrt(fe);
 				/* now we've gotta overwrite the dest reg */
-				*((int *)&fe->fe_fpstate->fpreg[rt]) = 1;
+				*((int *)&fe->fe_fpstate->fpreg[rt].fpr) = 1;
 				fpu_explode(fe, &fe->fe_f1, FTYPE_INT, rt);
 				fpu_div(fe);
 				break;
@ -679,7 +681,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 				fp = fpu_sqrt(fe);
 				fe->fe_f2 = *fp;
 				/* now we've gotta overwrite the dest reg */
-				*((int *)&fe->fe_fpstate->fpreg[rt]) = 1;
+				*((int *)&fe->fe_fpstate->fpreg[rt].fpr) = 1;
 				fpu_explode(fe, &fe->fe_f1, FTYPE_INT, rt);
 				fpu_div(fe);
 				break;
@ -735,7 +737,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 			/* If the instruction was single precision, round */
 			if (!(instr.i_any.i_opcd & 0x4)) {
 				fpu_implode(fe, fp, FTYPE_SNG, 
-					(u_int *)&fs->fpreg[rt]);
+					(u_int *)&fs->fpreg[rt].fpr);
 				fpu_explode(fe, fp = &fe->fe_f1, FTYPE_SNG, rt);
 			}
 		}
@ -750,7 +752,7 @@ fpu_execute(struct trapframe *tf, struct fpemu *fe, union instr *insn)
 	 * Otherwise set new current exceptions and accrue.
 	 */
 	if (fp)
-		fpu_implode(fe, fp, type, (u_int *)&fs->fpreg[rt]);
+		fpu_implode(fe, fp, type, (u_int *)&fs->fpreg[rt].fpr);
 	cx = fe->fe_cx;
 	fsr = fe->fe_fpscr;
 	if (cx != 0) {
--- a/sys/powerpc/fpu/fpu_explode.c
+++ b/sys/powerpc/fpu/fpu_explode.c
@ -211,9 +211,9 @@ fpu_explode(struct fpemu *fe, struct fpn *fp, int type, int reg)
 	u_int s, *space;
 	u_int64_t l, *xspace;

-	xspace = (u_int64_t *)&fe->fe_fpstate->fpreg[reg];
+	xspace = (u_int64_t *)&fe->fe_fpstate->fpreg[reg].fpr;
 	l = xspace[0];
-	space = (u_int *)&fe->fe_fpstate->fpreg[reg];
+	space = (u_int *)&fe->fe_fpstate->fpreg[reg].fpr;
 	s = space[0];
 	fp->fp_sign = s >> 31;
 	fp->fp_sticky = 0;
--- a/sys/powerpc/include/cpu.h
+++ b/sys/powerpc/include/cpu.h
@ -55,10 +55,12 @@ extern int cpu_features;
 #define	PPC_FEATURE_HAS_FPU	0x08000000
 #define	PPC_FEATURE_HAS_MMU	0x04000000
 #define PPC_FEATURE_UNIFIED_CACHE 0x01000000
+#define PPC_FEATURE_HAS_VSX	0x00000080

 #define	PPC_FEATURE_BITMASK						\
 	"\20"								\
-	"\040PPC32\037PPC64\035ALTIVEC\034FPU\033MMU\031UNIFIEDCACHE"
+	"\040PPC32\037PPC64\035ALTIVEC\034FPU\033MMU\031UNIFIEDCACHE"	\
+	"\010VSX"

 #define	TRAPF_USERMODE(frame)	(((frame)->srr1 & PSL_PR) != 0)
 #define	TRAPF_PC(frame)		((frame)->srr0)
--- a/sys/powerpc/include/pcb.h
+++ b/sys/powerpc/include/pcb.h
@ -50,8 +50,12 @@ struct pcb {
 #define	PCB_FPU		1	/* Process uses FPU */
 #define	PCB_FPREGS	2	/* Process had FPU registers initialized */
 #define	PCB_VEC		4	/* Process had Altivec initialized */
+#define	PCB_VSX		8	/* Process had VSX initialized */
 	struct fpu {
-		double	fpr[32];
+		union {
+			double fpr;
+			uint32_t vsr[4];
+		} fpr[32];
 		double	fpscr;	/* FPSCR stored as double for easier access */
 	} pcb_fpu;		/* Floating point processor */
 	unsigned int	pcb_fpcpu;		/* which CPU had our FPU
--- a/sys/powerpc/include/psl.h
+++ b/sys/powerpc/include/psl.h
@ -39,6 +39,7 @@
 * Machine State Register (MSR) - All cores
 */
 #define	PSL_VEC		0x02000000UL	/* AltiVec/SPE vector unit available */
+#define	PSL_VSX		0x00800000UL	/* Vector-Scalar unit available */
 #define	PSL_EE		0x00008000UL	/* external interrupt enable */
 #define	PSL_PR		0x00004000UL	/* privilege mode (1 == user) */
 #define	PSL_FP		0x00002000UL	/* floating point enable */
--- a/sys/powerpc/include/reg.h
+++ b/sys/powerpc/include/reg.h
@ -20,7 +20,10 @@ struct reg {

 /* Must match pcb.pcb_fpu */
 struct fpreg {
-	double fpreg[32];
+	union {
+		double fpr;
+		uint64_t vsr[2];
+	} fpreg[32];
 	double fpscr;
 };

--- a/sys/powerpc/include/trap.h
+++ b/sys/powerpc/include/trap.h
@ -74,6 +74,9 @@
 #define	EXC_DLMISS	0x1100		/* Data load translation miss */
 #define	EXC_DSMISS	0x1200		/* Data store translation miss */

+/* Power ISA 2.06+: */
+#define	EXC_VSX		0x0f40		/* VSX Unavailable */
+
 /* The following are available on 4xx and 85xx */
 #define	EXC_CRIT	0x0100		/* Critical Input Interrupt */
 #define	EXC_PIT		0x1000		/* Programmable Interval Timer */
--- a/sys/powerpc/powerpc/cpu.c
+++ b/sys/powerpc/powerpc/cpu.c
@ -141,17 +141,17 @@ static const struct cputab models[] = {
 	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU,
 	   NULL },
        { "IBM POWER7",		IBMPOWER7,	REVFMT_MAJMIN,
-	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU,
-	   NULL },
+	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU |
+	   PPC_FEATURE_HAS_VSX, NULL },
        { "IBM POWER7+",	IBMPOWER7PLUS,	REVFMT_MAJMIN,
-	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU,
-	   NULL },
+	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU |
+	   PPC_FEATURE_HAS_VSX, NULL },
        { "IBM POWER8E",	IBMPOWER8E,	REVFMT_MAJMIN,
-	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU,
-	   NULL },
+	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU |
+	   PPC_FEATURE_HAS_VSX, NULL },
        { "IBM POWER8",		IBMPOWER8,	REVFMT_MAJMIN,
-	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU,
-	   NULL },
+	   PPC_FEATURE_64 | PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU |
+	   PPC_FEATURE_HAS_VSX, NULL },
        { "Motorola PowerPC 7400",	MPC7400,	REVFMT_MAJMIN,
 	   PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_FPU, cpu_6xx_setup },
        { "Motorola PowerPC 7410",	MPC7410,	REVFMT_MAJMIN,
--- a/sys/powerpc/powerpc/db_trace.c
+++ b/sys/powerpc/powerpc/db_trace.c
@ -252,6 +252,7 @@ db_backtrace(struct thread *td, db_addr_t fp, int count)
 			case EXC_FPU: trapstr = "FPU"; break;
 			case EXC_DECR: trapstr = "DECR"; break;
 			case EXC_PERF: trapstr = "PERF"; break;
+			case EXC_VSX: trapstr = "VSX"; break;
 			default: trapstr = NULL; break;
 			}
 			if (trapstr != NULL) {
--- a/sys/powerpc/powerpc/exec_machdep.c
+++ b/sys/powerpc/powerpc/exec_machdep.c
@ -373,6 +373,7 @@ static int
 grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *pcb;
+	int i;

 	pcb = td->td_pcb;

@ -403,6 +404,9 @@ grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 		mcp->mc_flags |= _MC_FP_VALID;
 		memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double));
 		memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double));
+		for (i = 0; i < 32; i++)
+			memcpy(&mcp->mc_fpreg[i], &pcb->pcb_fpu.fpr[i].fpr,
+			    sizeof(double));
 	}

 	/*
@ -421,6 +425,8 @@ grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 		memcpy(mcp->mc_avec, pcb->pcb_vec.vr, sizeof(mcp->mc_avec));
 	}

+	/* XXX VSX context */
+
 	mcp->mc_len = sizeof(*mcp);

 	return (0);
@ -447,6 +453,7 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
 	struct pcb *pcb;
 	struct trapframe *tf;
 	register_t tls;
+	int i;

 	pcb = td->td_pcb;
 	tf = td->td_frame;
@ -476,7 +483,10 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
 		/* enable_fpu() will happen lazily on a fault */
 		pcb->pcb_flags |= PCB_FPREGS;
 		memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double));
-		memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double));
+		bzero(pcb->pcb_fpu.fpr, sizeof(pcb->pcb_fpu.fpr));
+		for (i = 0; i < 32; i++)
+			memcpy(&pcb->pcb_fpu.fpr[i].fpr, &mcp->mc_fpreg[i],
+			    sizeof(double));
 	}

 	if (mcp->mc_flags & _MC_AV_VALID) {
@ -490,6 +500,8 @@ set_mcontext(struct thread *td, mcontext_t *mcp)
 		memcpy(pcb->pcb_vec.vr, mcp->mc_avec, sizeof(mcp->mc_avec));
 	}

+	/* XXX VSX context */
+
 	return (0);
 }

--- a/sys/powerpc/powerpc/fpu.c
+++ b/sys/powerpc/powerpc/fpu.c
@ -67,7 +67,10 @@ enable_fpu(struct thread *td)
 	 * to indicate that the FPU is in use.
 	 */
 	pcb->pcb_flags |= PCB_FPU;
-	tf->srr1 |= PSL_FP;
+	if (pcb->pcb_flags & PCB_VSX)
+		tf->srr1 |= PSL_FP | PSL_VSX;
+	else
+		tf->srr1 |= PSL_FP;
 	if (!(pcb->pcb_flags & PCB_FPREGS)) {
 		memset(&pcb->pcb_fpu, 0, sizeof pcb->pcb_fpu);
 		pcb->pcb_flags |= PCB_FPREGS;
@ -78,7 +81,10 @@ enable_fpu(struct thread *td)
 	 * can be restored.
 	 */
 	msr = mfmsr();
-	mtmsr(msr | PSL_FP);
+	if (pcb->pcb_flags & PCB_VSX)
+		mtmsr(msr | PSL_FP | PSL_VSX);
+	else
+		mtmsr(msr | PSL_FP);
 	isync();

 	/*
@ -89,17 +95,31 @@ enable_fpu(struct thread *td)
 	__asm __volatile ("lfd 0,0(%0); mtfsf 0xff,0"
 			  :: "b"(&pcb->pcb_fpu.fpscr));

-#define LFP(n)   __asm ("lfd " #n ", 0(%0)" \
-		:: "b"(&pcb->pcb_fpu.fpr[n]));
-	LFP(0);		LFP(1);		LFP(2);		LFP(3);
-	LFP(4);		LFP(5);		LFP(6);		LFP(7);
-	LFP(8);		LFP(9);		LFP(10);	LFP(11);
-	LFP(12);	LFP(13);	LFP(14);	LFP(15);
-	LFP(16);	LFP(17);	LFP(18);	LFP(19);
-	LFP(20);	LFP(21);	LFP(22);	LFP(23);
-	LFP(24);	LFP(25);	LFP(26);	LFP(27);
-	LFP(28);	LFP(29);	LFP(30);	LFP(31);
-#undef LFP
+	if (pcb->pcb_flags & PCB_VSX) {
+	#define LFP(n)   __asm ("lxvw4x " #n ", 0,%0" \
+			:: "b"(&pcb->pcb_fpu.fpr[n]));
+		LFP(0);		LFP(1);		LFP(2);		LFP(3);
+		LFP(4);		LFP(5);		LFP(6);		LFP(7);
+		LFP(8);		LFP(9);		LFP(10);	LFP(11);
+		LFP(12);	LFP(13);	LFP(14);	LFP(15);
+		LFP(16);	LFP(17);	LFP(18);	LFP(19);
+		LFP(20);	LFP(21);	LFP(22);	LFP(23);
+		LFP(24);	LFP(25);	LFP(26);	LFP(27);
+		LFP(28);	LFP(29);	LFP(30);	LFP(31);
+	#undef LFP
+	} else {
+	#define LFP(n)   __asm ("lfd " #n ", 0(%0)" \
+			:: "b"(&pcb->pcb_fpu.fpr[n]));
+		LFP(0);		LFP(1);		LFP(2);		LFP(3);
+		LFP(4);		LFP(5);		LFP(6);		LFP(7);
+		LFP(8);		LFP(9);		LFP(10);	LFP(11);
+		LFP(12);	LFP(13);	LFP(14);	LFP(15);
+		LFP(16);	LFP(17);	LFP(18);	LFP(19);
+		LFP(20);	LFP(21);	LFP(22);	LFP(23);
+		LFP(24);	LFP(25);	LFP(26);	LFP(27);
+		LFP(28);	LFP(29);	LFP(30);	LFP(31);
+	#undef LFP
+	}

 	isync();
 	mtmsr(msr);
@ -117,23 +137,40 @@ save_fpu(struct thread *td)
 	 * Temporarily re-enable floating-point during the save
 	 */
 	msr = mfmsr();
-	mtmsr(msr | PSL_FP);
+	if (pcb->pcb_flags & PCB_VSX)
+		mtmsr(msr | PSL_FP | PSL_VSX);
+	else
+		mtmsr(msr | PSL_FP);
 	isync();

 	/*
 	 * Save the floating-point registers and FPSCR to the PCB
 	 */
-#define SFP(n)   __asm ("stfd " #n ", 0(%0)" \
-		:: "b"(&pcb->pcb_fpu.fpr[n]));
-	SFP(0);		SFP(1);		SFP(2);		SFP(3);
-	SFP(4);		SFP(5);		SFP(6);		SFP(7);
-	SFP(8);		SFP(9);		SFP(10);	SFP(11);
-	SFP(12);	SFP(13);	SFP(14);	SFP(15);
-	SFP(16);	SFP(17);	SFP(18);	SFP(19);
-	SFP(20);	SFP(21);	SFP(22);	SFP(23);
-	SFP(24);	SFP(25);	SFP(26);	SFP(27);
-	SFP(28);	SFP(29);	SFP(30);	SFP(31);
-#undef SFP
+	if (pcb->pcb_flags & PCB_VSX) {
+	#define SFP(n)   __asm ("stxvw4x " #n ", 0,%0" \
+			:: "b"(&pcb->pcb_fpu.fpr[n]));
+		SFP(0);		SFP(1);		SFP(2);		SFP(3);
+		SFP(4);		SFP(5);		SFP(6);		SFP(7);
+		SFP(8);		SFP(9);		SFP(10);	SFP(11);
+		SFP(12);	SFP(13);	SFP(14);	SFP(15);
+		SFP(16);	SFP(17);	SFP(18);	SFP(19);
+		SFP(20);	SFP(21);	SFP(22);	SFP(23);
+		SFP(24);	SFP(25);	SFP(26);	SFP(27);
+		SFP(28);	SFP(29);	SFP(30);	SFP(31);
+	#undef SFP
+	} else {
+	#define SFP(n)   __asm ("stfd " #n ", 0(%0)" \
+			:: "b"(&pcb->pcb_fpu.fpr[n]));
+		SFP(0);		SFP(1);		SFP(2);		SFP(3);
+		SFP(4);		SFP(5);		SFP(6);		SFP(7);
+		SFP(8);		SFP(9);		SFP(10);	SFP(11);
+		SFP(12);	SFP(13);	SFP(14);	SFP(15);
+		SFP(16);	SFP(17);	SFP(18);	SFP(19);
+		SFP(20);	SFP(21);	SFP(22);	SFP(23);
+		SFP(24);	SFP(25);	SFP(26);	SFP(27);
+		SFP(28);	SFP(29);	SFP(30);	SFP(31);
+	#undef SFP
+	}
 	__asm __volatile ("mffs 0; stfd 0,0(%0)" :: "b"(&pcb->pcb_fpu.fpscr));

 	/*