From: Martin Schwidefsky Date: Wed, 31 Dec 2008 14:11:42 +0000 (+0100) Subject: [PATCH] fast vdso implementation for CLOCK_THREAD_CPUTIME_ID X-Git-Tag: v2.6.29-rc1~527^2 X-Git-Url: http://ftp.safe.ca/?a=commitdiff_plain;h=c742b31c03f37c5c499178f09f57381aa6c70131;p=safe%2Fjmp%2Flinux-2.6 [PATCH] fast vdso implementation for CLOCK_THREAD_CPUTIME_ID The extract cpu time (ectg) instruction allows the user process to get the current thread cputime without calling into the kernel. The code that uses the instruction needs to switch to the access registers mode to get access to the per-cpu info page that contains the two base values that are needed to calculate the current cputime from the CPU timer with the ectg instruction. Signed-off-by: Martin Schwidefsky --- diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index a547817..ffdef5f 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -106,8 +106,10 @@ #define __LC_IPLDEV 0xDB8 #define __LC_CURRENT 0xDD8 #define __LC_INT_CLOCK 0xDE8 +#define __LC_VDSO_PER_CPU 0xE38 #endif /* __s390x__ */ +#define __LC_PASTE 0xE40 #define __LC_PANIC_MAGIC 0xE00 #ifndef __s390x__ @@ -381,7 +383,12 @@ struct _lowcore /* whether the kernel died with panic() or not */ __u32 panic_magic; /* 0xe00 */ - __u8 pad13[0x11b8-0xe04]; /* 0xe04 */ + /* Per cpu primary space access list */ + __u8 pad_0xe04[0xe38-0xe04]; /* 0xe04 */ + __u64 vdso_per_cpu_data; /* 0xe38 */ + __u32 paste[16]; /* 0xe40 */ + + __u8 pad13[0x11b8-0xe80]; /* 0xe80 */ /* 64 bit extparam used for pfault, diag 250 etc */ __u64 ext_params2; /* 0x11B8 */ diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index a44f4fe..7bdd7c8 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -12,9 +12,9 @@ #ifndef __ASSEMBLY__ /* - * Note about this structure: + * Note about the vdso_data and vdso_per_cpu_data structures: * - * NEVER USE THIS IN USERSPACE CODE DIRECTLY. 
The layout of this + * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the * structure is supposed to be known only to the function in the vdso * itself and may change without notice. */ @@ -28,10 +28,21 @@ struct vdso_data { __u64 wtom_clock_nsec; /* 0x28 */ __u32 tz_minuteswest; /* Minutes west of Greenwich 0x30 */ __u32 tz_dsttime; /* Type of dst correction 0x34 */ + __u32 ectg_available; +}; + +struct vdso_per_cpu_data { + __u64 ectg_timer_base; + __u64 ectg_user_time; }; extern struct vdso_data *vdso_data; +#ifdef CONFIG_64BIT +int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore); +void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore); +#endif + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index e641f60..67a6001 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -48,6 +48,11 @@ int main(void) DEFINE(__VDSO_WTOM_SEC, offsetof(struct vdso_data, wtom_clock_sec)); DEFINE(__VDSO_WTOM_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); DEFINE(__VDSO_TIMEZONE, offsetof(struct vdso_data, tz_minuteswest)); + DEFINE(__VDSO_ECTG_OK, offsetof(struct vdso_data, ectg_available)); + DEFINE(__VDSO_ECTG_BASE, + offsetof(struct vdso_per_cpu_data, ectg_timer_base)); + DEFINE(__VDSO_ECTG_USER, + offsetof(struct vdso_per_cpu_data, ectg_user_time)); /* constants used by the vdso */ DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index ae83c19..c6fbde1 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -177,8 +177,11 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ .if !\sync ni \psworg+1,0xfd # clear wait state bit .endif - lmg %r0,%r15,SP_R0(%r15) # load gprs 0-15 of user + lg %r14,__LC_VDSO_PER_CPU + lmg %r0,%r13,SP_R0(%r15) # load gprs 0-13 of user stpt __LC_EXIT_TIMER + mvc 
__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER + lmg %r14,%r15,SP_R14(%r15) # load grps 14-15 of user lpswe \psworg # back to caller .endm @@ -980,23 +983,23 @@ cleanup_sysc_return: cleanup_sysc_leave: clc 8(8,%r12),BASED(cleanup_sysc_leave_insn) - je 2f - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER + je 3f clc 8(8,%r12),BASED(cleanup_sysc_leave_insn+8) - je 2f - mvc __LC_RETURN_PSW(16),SP_PSW(%r15) + jhe 0f + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER +0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15) cghi %r12,__LC_MCK_OLD_PSW - jne 0f + jne 1f mvc __LC_SAVE_AREA+64(32),SP_R12(%r15) - j 1f -0: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) -1: lmg %r0,%r11,SP_R0(%r15) + j 2f +1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) +2: lmg %r0,%r11,SP_R0(%r15) lg %r15,SP_R15(%r15) -2: la %r12,__LC_RETURN_PSW +3: la %r12,__LC_RETURN_PSW br %r14 cleanup_sysc_leave_insn: .quad sysc_done - 4 - .quad sysc_done - 8 + .quad sysc_done - 16 cleanup_io_return: mvc __LC_RETURN_PSW(8),0(%r12) @@ -1006,23 +1009,23 @@ cleanup_io_return: cleanup_io_leave: clc 8(8,%r12),BASED(cleanup_io_leave_insn) - je 2f - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER + je 3f clc 8(8,%r12),BASED(cleanup_io_leave_insn+8) - je 2f - mvc __LC_RETURN_PSW(16),SP_PSW(%r15) + jhe 0f + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER +0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15) cghi %r12,__LC_MCK_OLD_PSW - jne 0f + jne 1f mvc __LC_SAVE_AREA+64(32),SP_R12(%r15) - j 1f -0: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) -1: lmg %r0,%r11,SP_R0(%r15) + j 2f +1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) +2: lmg %r0,%r11,SP_R0(%r15) lg %r15,SP_R15(%r15) -2: la %r12,__LC_RETURN_PSW +3: la %r12,__LC_RETURN_PSW br %r14 cleanup_io_leave_insn: .quad io_done - 4 - .quad io_done - 8 + .quad io_done - 16 /* * Integer constants diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 3ccd36b..f9f70aa 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -87,6 +87,8 @@ startup_continue: lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to 
parameter area # move IPL device to lowcore mvc __LC_IPLDEV(4),IPL_DEVICE+4-PARMAREA(%r12) + lghi %r0,__LC_PASTE + stg %r0,__LC_VDSO_PER_CPU # # Setup stack # diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b7a1efd..d825f49 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -427,6 +427,8 @@ setup_lowcore(void) /* enable extended save area */ __ctl_set_bit(14, 29); } +#else + lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0]; #endif set_prefix((u32)(unsigned long) lc); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 3979a6f..b3461e8 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "entry.h" /* @@ -506,6 +507,9 @@ static int __cpuinit smp_alloc_lowcore(int cpu) goto out; lowcore->extended_save_area_addr = (u32) save_area; } +#else + if (vdso_alloc_per_cpu(cpu, lowcore)) + goto out; #endif lowcore_ptr[cpu] = lowcore; return 0; @@ -528,6 +532,8 @@ static void smp_free_lowcore(int cpu) #ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) free_page((unsigned long) lowcore->extended_save_area_addr); +#else + vdso_free_per_cpu(cpu, lowcore); #endif free_page(lowcore->panic_stack - PAGE_SIZE); free_pages(lowcore->async_stack - ASYNC_SIZE, ASYNC_ORDER); @@ -670,6 +676,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) lowcore = (void *) __get_free_pages(GFP_KERNEL | GFP_DMA, lc_order); panic_stack = __get_free_page(GFP_KERNEL); async_stack = __get_free_pages(GFP_KERNEL, ASYNC_ORDER); + BUG_ON(!lowcore || !panic_stack || !async_stack); #ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) save_area = get_zeroed_page(GFP_KERNEL); @@ -683,6 +690,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) #ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) lowcore->extended_save_area_addr = (u32) save_area; +#else + BUG_ON(vdso_alloc_per_cpu(smp_processor_id(), lowcore)); #endif set_prefix((u32)(unsigned long) lowcore); local_mcck_enable(); diff --git 
a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 10a6cce..25a6a82 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -31,9 +31,6 @@ #include #include -/* Max supported size for symbol names */ -#define MAX_SYMNAME 64 - #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT) extern char vdso32_start, vdso32_end; static void *vdso32_kbase = &vdso32_start; @@ -71,6 +68,119 @@ static union { struct vdso_data *vdso_data = &vdso_data_store.data; /* + * Setup vdso data page. + */ +static void vdso_init_data(struct vdso_data *vd) +{ + unsigned int facility_list; + + facility_list = stfl(); + vd->ectg_available = switch_amode && (facility_list & 1); +} + +#ifdef CONFIG_64BIT +/* + * Setup per cpu vdso data page. + */ +static void vdso_init_per_cpu_data(int cpu, struct vdso_per_cpu_data *vpcd) +{ +} + +/* + * Allocate/free per cpu vdso data. + */ +#ifdef CONFIG_64BIT +#define SEGMENT_ORDER 2 +#else +#define SEGMENT_ORDER 1 +#endif + +int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore) +{ + unsigned long segment_table, page_table, page_frame; + u32 *psal, *aste; + int i; + + lowcore->vdso_per_cpu_data = __LC_PASTE; + + if (!switch_amode || !vdso_enabled) + return 0; + + segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); + page_table = get_zeroed_page(GFP_KERNEL | GFP_DMA); + page_frame = get_zeroed_page(GFP_KERNEL); + if (!segment_table || !page_table || !page_frame) + goto out; + + clear_table((unsigned long *) segment_table, _SEGMENT_ENTRY_EMPTY, + PAGE_SIZE << SEGMENT_ORDER); + clear_table((unsigned long *) page_table, _PAGE_TYPE_EMPTY, + 256*sizeof(unsigned long)); + + *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; + *(unsigned long *) page_table = _PAGE_RO + page_frame; + + psal = (u32 *) (page_table + 256*sizeof(unsigned long)); + aste = psal + 32; + + for (i = 4; i < 32; i += 4) + psal[i] = 0x80000000; + + lowcore->paste[4] = (u32)(addr_t) psal; + psal[0] = 0x20000000; + psal[2] = (u32)(addr_t) aste; + *(unsigned 
long *) (aste + 2) = segment_table + + _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT; + aste[4] = (u32)(addr_t) psal; + lowcore->vdso_per_cpu_data = page_frame; + + vdso_init_per_cpu_data(cpu, (struct vdso_per_cpu_data *) page_frame); + return 0; + +out: + free_page(page_frame); + free_page(page_table); + free_pages(segment_table, SEGMENT_ORDER); + return -ENOMEM; +} + +#ifdef CONFIG_HOTPLUG_CPU +void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore) +{ + unsigned long segment_table, page_table, page_frame; + u32 *psal, *aste; + + if (!switch_amode || !vdso_enabled) + return; + + psal = (u32 *)(addr_t) lowcore->paste[4]; + aste = (u32 *)(addr_t) psal[2]; + segment_table = *(unsigned long *)(aste + 2) & PAGE_MASK; + page_table = *(unsigned long *) segment_table; + page_frame = *(unsigned long *) page_table; + + free_page(page_frame); + free_page(page_table); + free_pages(segment_table, SEGMENT_ORDER); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static void __vdso_init_cr5(void *dummy) +{ + unsigned long cr5; + + cr5 = offsetof(struct _lowcore, paste); + __ctl_load(cr5, 5, 5); +} + +static void vdso_init_cr5(void) +{ + if (switch_amode && vdso_enabled) + on_each_cpu(__vdso_init_cr5, NULL, 1); +} +#endif /* CONFIG_64BIT */ + +/* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree */ @@ -172,6 +282,9 @@ static int __init vdso_init(void) { int i; + if (!vdso_enabled) + return 0; + vdso_init_data(vdso_data); #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT) /* Calculate the size of the 32 bit vDSO */ vdso32_pages = ((&vdso32_end - &vdso32_start @@ -208,6 +321,10 @@ static int __init vdso_init(void) } vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data); vdso64_pagelist[vdso64_pages] = NULL; +#ifndef CONFIG_SMP + BUG_ON(vdso_alloc_per_cpu(0, S390_lowcore)); +#endif + vdso_init_cr5(); #endif /* CONFIG_64BIT */ get_page(virt_to_page(vdso_data)); diff --git 
a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S index 488e31a..9ce8caa 100644 --- a/arch/s390/kernel/vdso64/clock_getres.S +++ b/arch/s390/kernel/vdso64/clock_getres.S @@ -22,7 +22,12 @@ __kernel_clock_getres: cghi %r2,CLOCK_REALTIME je 0f cghi %r2,CLOCK_MONOTONIC + je 0f + cghi %r2,-2 /* CLOCK_THREAD_CPUTIME_ID for this thread */ jne 2f + larl %r5,_vdso_data + icm %r0,15,__VDSO_ECTG_OK(%r5) /* vdso_data.ectg_available */ + jz 2f 0: ltgr %r3,%r3 jz 1f /* res == NULL */ larl %r1,3f diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S index 738a410..79dbfee 100644 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -22,8 +22,10 @@ __kernel_clock_gettime: larl %r5,_vdso_data cghi %r2,CLOCK_REALTIME je 4f + cghi %r2,-2 /* CLOCK_THREAD_CPUTIME_ID for this thread */ + je 9f cghi %r2,CLOCK_MONOTONIC - jne 9f + jne 12f /* CLOCK_MONOTONIC */ ltgr %r3,%r3 @@ -42,7 +44,7 @@ __kernel_clock_gettime: alg %r0,__VDSO_WTOM_SEC(%r5) clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ jne 0b - larl %r5,10f + larl %r5,13f 1: clg %r1,0(%r5) jl 2f slg %r1,0(%r5) @@ -68,7 +70,7 @@ __kernel_clock_gettime: lg %r0,__VDSO_XTIME_SEC(%r5) clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ jne 5b - larl %r5,10f + larl %r5,13f 6: clg %r1,0(%r5) jl 7f slg %r1,0(%r5) @@ -79,11 +81,38 @@ __kernel_clock_gettime: 8: lghi %r2,0 br %r14 + /* CLOCK_THREAD_CPUTIME_ID for this thread */ +9: icm %r0,15,__VDSO_ECTG_OK(%r5) + jz 12f + ear %r2,%a4 + llilh %r4,0x0100 + sar %a4,%r4 + lghi %r4,0 + sacf 512 /* Magic ectg instruction */ + .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 + sacf 0 + sar %a4,%r2 + algr %r1,%r0 /* r1 = cputime as TOD value */ + mghi %r1,1000 /* convert to nanoseconds */ + srlg %r1,%r1,12 /* r1 = cputime in nanosec */ + lgr %r4,%r1 + larl %r5,13f + srlg %r1,%r1,9 /* divide by 1000000000 */ + mlg %r0,8(%r5) + srlg %r0,%r0,11 /* r0 = tv_sec */ + stg %r0,0(%r3) + msg 
%r0,0(%r5) /* calculate tv_nsec */ + slgr %r4,%r0 /* r4 = tv_nsec */ + stg %r4,8(%r3) + lghi %r2,0 + br %r14 + /* Fallback to system call */ -9: lghi %r1,__NR_clock_gettime +12: lghi %r1,__NR_clock_gettime svc 0 br %r14 -10: .quad 1000000000 +13: .quad 1000000000 +14: .quad 19342813113834067 .cfi_endproc .size __kernel_clock_gettime,.-__kernel_clock_gettime