From: Stuart Menefy Date: Fri, 24 Nov 2006 02:42:24 +0000 (+0900) Subject: sh: TLB miss fast-path optimizations. X-Git-Tag: v2.6.20-rc1~34^2~20^2~32^2~23 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9b3a53ab76771e3669e50086c131e1574fe25847;p=linux-2.6 sh: TLB miss fast-path optimizations. Handle simple TLB miss faults which can be resolved completely from the page table in assembler. Signed-off-by: Stuart Menefy Signed-off-by: Paul Mundt --- diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a03f155571..48308dc86e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -379,6 +379,9 @@ config CPU_HAS_SR_RB See for further information on SR.RB and register banking in the kernel in general. +config CPU_HAS_PTEA + bool + endmenu menu "Timer support" diff --git a/arch/sh/kernel/cpu/sh3/entry.S b/arch/sh/kernel/cpu/sh3/entry.S index 869d56fb7d..5de99b4987 100644 --- a/arch/sh/kernel/cpu/sh3/entry.S +++ b/arch/sh/kernel/cpu/sh3/entry.S @@ -13,8 +13,10 @@ #include #include #include -#include #include +#include +#include +#include ! NOTE: ! GNU as (as of 2.9.1) changes bf/s into bt/s and bra, when the address @@ -136,29 +138,14 @@ ENTRY(tlb_protection_violation_store) call_dpf: mov.l 1f, r0 - mov r5, r8 - mov.l @r0, r6 - mov r6, r9 - mov.l 2f, r0 - sts pr, r10 - jsr @r0 - mov r15, r4 - ! - tst r0, r0 - bf/s 0f - lds r10, pr - rts - nop -0: sti + mov.l @r0, r6 ! address mov.l 3f, r0 - mov r9, r6 - mov r8, r5 + sti jmp @r0 - mov r15, r4 + mov r15, r4 ! regs .align 2 1: .long MMU_TEA -2: .long __do_page_fault 3: .long do_page_fault .align 2 @@ -344,9 +331,176 @@ general_exception: 2: .long ret_from_exception ! ! + +/* This code makes some assumptions to improve performance. + * Make sure they are stil true. */ +#if PTRS_PER_PGD != PTRS_PER_PTE +#error PDG and PTE sizes don't match +#endif + +/* gas doesn't flag impossible values for mov #immediate as an error */ +#if (_PAGE_PRESENT >> 2) > 0x7f +#error cannot load PAGE_PRESENT as an immediate +#endif +#if _PAGE_DIRTY > 0x7f +#error cannot load PAGE_DIRTY as an immediate +#endif +#if (_PAGE_PRESENT << 2) != _PAGE_ACCESSED +#error cannot derive PAGE_ACCESSED from PAGE_PRESENT +#endif + +#if defined(CONFIG_CPU_SH4) +#define ldmmupteh(r) mov.l 8f, r +#else +#define ldmmupteh(r) mov #MMU_PTEH, r +#endif + .balign 1024,0,1024 tlb_miss: - mov.l 1f, k2 +#ifdef COUNT_EXCEPTIONS + ! Increment the counts + mov.l 9f, k1 + mov.l @k1, k2 + add #1, k2 + mov.l k2, @k1 +#endif + + ! k0 scratch + ! k1 pgd and pte pointers + ! k2 faulting address + ! k3 pgd and pte index masks + ! k4 shift + + ! Load up the pgd entry (k1) + + ldmmupteh(k0) ! 9 LS (latency=2) MMU_PTEH + + mov.w 4f, k3 ! 8 LS (latency=2) (PTRS_PER_PGD-1) << 2 + mov #-(PGDIR_SHIFT-2), k4 ! 6 EX + + mov.l @(MMU_TEA-MMU_PTEH,k0), k2 ! 18 LS (latency=2) + + mov.l @(MMU_TTB-MMU_PTEH,k0), k1 ! 18 LS (latency=2) + + mov k2, k0 ! 5 MT (latency=0) + shld k4, k0 ! 99 EX + + and k3, k0 ! 78 EX + + mov.l @(k0, k1), k1 ! 21 LS (latency=2) + mov #-(PAGE_SHIFT-2), k4 ! 6 EX + + ! Load up the pte entry (k2) + + mov k2, k0 ! 5 MT (latency=0) + shld k4, k0 ! 99 EX + + tst k1, k1 ! 86 MT + + bt 20f ! 110 BR + + and k3, k0 ! 78 EX + mov.w 5f, k4 ! 8 LS (latency=2) _PAGE_PRESENT + + mov.l @(k0, k1), k2 ! 21 LS (latency=2) + add k0, k1 ! 49 EX + +#ifdef CONFIG_CPU_HAS_PTEA + ! Test the entry for present and _PAGE_ACCESSED + + mov #-28, k3 ! 6 EX + mov k2, k0 ! 5 MT (latency=0) + + tst k4, k2 ! 68 MT + shld k3, k0 ! 99 EX + + bt 20f ! 110 BR + + ! Set PTEA register + ! MMU_PTEA = ((pteval >> 28) & 0xe) | (pteval & 0x1) + ! + ! k0=pte>>28, k1=pte*, k2=pte, k3=, k4=_PAGE_PRESENT + + and #0xe, k0 ! 79 EX + + mov k0, k3 ! 5 MT (latency=0) + mov k2, k0 ! 5 MT (latency=0) + + and #1, k0 ! 79 EX + + or k0, k3 ! 82 EX + + ldmmupteh(k0) ! 9 LS (latency=2) + shll2 k4 ! 101 EX _PAGE_ACCESSED + + tst k4, k2 ! 68 MT + + mov.l k3, @(MMU_PTEA-MMU_PTEH,k0) ! 27 LS + + mov.l 7f, k3 ! 9 LS (latency=2) _PAGE_FLAGS_HARDWARE_MASK + + ! k0=MMU_PTEH, k1=pte*, k2=pte, k3=_PAGE_FLAGS_HARDWARE, k4=_PAGE_ACCESSED +#else + + ! Test the entry for present and _PAGE_ACCESSED + + mov.l 7f, k3 ! 9 LS (latency=2) _PAGE_FLAGS_HARDWARE_MASK + tst k4, k2 ! 68 MT + + shll2 k4 ! 101 EX _PAGE_ACCESSED + ldmmupteh(k0) ! 9 LS (latency=2) + + bt 20f ! 110 BR + tst k4, k2 ! 68 MT + + ! k0=MMU_PTEH, k1=pte*, k2=pte, k3=_PAGE_FLAGS_HARDWARE, k4=_PAGE_ACCESSED + +#endif + + ! Set up the entry + + and k2, k3 ! 78 EX + bt/s 10f ! 108 BR + + mov.l k3, @(MMU_PTEL-MMU_PTEH,k0) ! 27 LS + + ldtlb ! 128 CO + + ! At least one instruction between ldtlb and rte + nop ! 119 NOP + + rte ! 126 CO + + nop ! 119 NOP + + +10: or k4, k2 ! 82 EX + + ldtlb ! 128 CO + + ! At least one instruction between ldtlb and rte + mov.l k2, @k1 ! 27 LS + + rte ! 126 CO + + ! Note we cannot execute mov here, because it is executed after + ! restoring SSR, so would be executed in user space. + nop ! 119 NOP + + + .align 5 + ! Once cache line if possible... +1: .long swapper_pg_dir +4: .short (PTRS_PER_PGD-1) << 2 +5: .short _PAGE_PRESENT +7: .long _PAGE_FLAGS_HARDWARE_MASK +8: .long MMU_PTEH +#ifdef COUNT_EXCEPTIONS +9: .long exception_count_miss +#endif + + ! Either pgd or pte not present +20: mov.l 1f, k2 mov.l 4f, k3 bra handle_exception mov.l @k2, k2 @@ -496,6 +650,15 @@ skip_save: bf interrupt_exception shlr2 r8 shlr r8 + +#ifdef COUNT_EXCEPTIONS + mov.l 5f, r9 + add r8, r9 + mov.l @r9, r10 + add #1, r10 + mov.l r10, @r9 +#endif + mov.l 4f, r9 add r8, r9 mov.l @r9, r9 @@ -509,6 +672,9 @@ skip_save: 2: .long 0x000080f0 ! FD=1, IMASK=15 3: .long 0xcfffffff ! RB=0, BL=0 4: .long exception_handling_table +#ifdef COUNT_EXCEPTIONS +5: .long exception_count_table +#endif interrupt_exception: mov.l 1f, r9 diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c index c294de1e14..afe0f1b1c0 100644 --- a/arch/sh/kernel/cpu/sh4/probe.c +++ b/arch/sh/kernel/cpu/sh4/probe.c @@ -79,16 +79,16 @@ int __init detect_cpu_and_cache_system(void) case 0x205: cpu_data->type = CPU_SH7750; cpu_data->flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU | - CPU_HAS_PERF_COUNTER | CPU_HAS_PTEA; + CPU_HAS_PERF_COUNTER; break; case 0x206: cpu_data->type = CPU_SH7750S; cpu_data->flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU | - CPU_HAS_PERF_COUNTER | CPU_HAS_PTEA; + CPU_HAS_PERF_COUNTER; break; case 0x1100: cpu_data->type = CPU_SH7751; - cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA; + cpu_data->flags |= CPU_HAS_FPU; break; case 0x2000: cpu_data->type = CPU_SH73180; @@ -126,23 +126,22 @@ int __init detect_cpu_and_cache_system(void) break; case 0x8000: cpu_data->type = CPU_ST40RA; - cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA; + cpu_data->flags |= CPU_HAS_FPU; break; case 0x8100: cpu_data->type = CPU_ST40GX1; - cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA; + cpu_data->flags |= CPU_HAS_FPU; break; case 0x700: cpu_data->type = CPU_SH4_501; cpu_data->icache.ways = 2; cpu_data->dcache.ways = 2; - cpu_data->flags |= CPU_HAS_PTEA; break; case 0x600: cpu_data->type = CPU_SH4_202; cpu_data->icache.ways = 2; cpu_data->dcache.ways = 2; - cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA; + cpu_data->flags |= CPU_HAS_FPU; break; case 0x500 ... 0x501: switch (prr) { @@ -160,7 +159,7 @@ int __init detect_cpu_and_cache_system(void) cpu_data->icache.ways = 2; cpu_data->dcache.ways = 2; - cpu_data->flags |= CPU_HAS_FPU | CPU_HAS_PTEA; + cpu_data->flags |= CPU_HAS_FPU; break; default: @@ -173,6 +172,10 @@ int __init detect_cpu_and_cache_system(void) cpu_data->dcache.ways = 1; #endif +#ifdef CONFIG_CPU_HAS_PTEA + cpu_data->flags |= CPU_HAS_PTEA; +#endif + /* * On anything that's not a direct-mapped cache, look to the CVR * for I/D-cache specifics. diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 88e9663fc9..6cd6d0045d 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -20,6 +20,7 @@ config CPU_SH4 bool select CPU_HAS_INTEVT select CPU_HAS_SR_RB + select CPU_HAS_PTEA if !CPU_SUBTYPE_ST40 config CPU_SH4A bool diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 128907ef7f..123fb80c85 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -223,89 +223,3 @@ do_sigbus: if (!user_mode(regs)) goto no_context; } - -#ifdef CONFIG_SH_STORE_QUEUES -/* - * This is a special case for the SH-4 store queues, as pages for this - * space still need to be faulted in before it's possible to flush the - * store queue cache for writeout to the remapped region. - */ -#define P3_ADDR_MAX (P4SEG_STORE_QUE + 0x04000000) -#else -#define P3_ADDR_MAX P4SEG -#endif - -/* - * Called with interrupts disabled. - */ -asmlinkage int __kprobes __do_page_fault(struct pt_regs *regs, - unsigned long writeaccess, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - pte_t entry; - struct mm_struct *mm = current->mm; - spinlock_t *ptl; - int ret = 1; - -#ifdef CONFIG_SH_KGDB - if (kgdb_nofault && kgdb_bus_err_hook) - kgdb_bus_err_hook(); -#endif - - /* - * We don't take page faults for P1, P2, and parts of P4, these - * are always mapped, whether it be due to legacy behaviour in - * 29-bit mode, or due to PMB configuration in 32-bit mode. - */ - if (address >= P3SEG && address < P3_ADDR_MAX) { - pgd = pgd_offset_k(address); - mm = NULL; - } else { - if (unlikely(address >= TASK_SIZE || !mm)) - return 1; - - pgd = pgd_offset(mm, address); - } - - pud = pud_offset(pgd, address); - if (pud_none_or_clear_bad(pud)) - return 1; - pmd = pmd_offset(pud, address); - if (pmd_none_or_clear_bad(pmd)) - return 1; - - if (mm) - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - else - pte = pte_offset_kernel(pmd, address); - - entry = *pte; - if (unlikely(pte_none(entry) || pte_not_present(entry))) - goto unlock; - if (unlikely(writeaccess && !pte_write(entry))) - goto unlock; - - if (writeaccess) - entry = pte_mkdirty(entry); - entry = pte_mkyoung(entry); - -#ifdef CONFIG_CPU_SH4 - /* - * ITLB is not affected by "ldtlb" instruction. - * So, we need to flush the entry by ourselves. - */ - __flush_tlb_page(get_asid(), address & PAGE_MASK); -#endif - - set_pte(pte, entry); - update_mmu_cache(NULL, address, entry); - ret = 0; -unlock: - if (mm) - pte_unmap_unlock(pte, ptl); - return ret; -} diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h index b1f21e7656..fa62524505 100644 --- a/include/asm-sh/pgtable.h +++ b/include/asm-sh/pgtable.h @@ -43,12 +43,12 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; /* PGD bits */ #define PGDIR_SHIFT (PTE_SHIFT + PTE_BITS) #define PGDIR_BITS (32 - PGDIR_SHIFT) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_SIZE (1 << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) /* Entries per level */ -#define PTRS_PER_PTE (1UL << PTE_BITS) -#define PTRS_PER_PGD (1UL << PGDIR_BITS) +#define PTRS_PER_PTE (1 << PTE_BITS) +#define PTRS_PER_PGD (1 << PGDIR_BITS) #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) #define FIRST_USER_ADDRESS 0