#include <asm/arch/vmalloc.h>
/*
- * We pull a couple of tricks here:
- * 1. We wrap the PMD into the PGD.
- * 2. We lie about the size of the PTE and PGD.
- * Even though we have 256 PTE entries and 4096 PGD entries, we tell
- * Linux that we actually have 512 PTE entries and 2048 PGD entries.
- * Each "Linux" PGD entry is made up of two hardware PGD entries, and
- * each PTE table is actually two hardware PTE tables.
+ * Hardware-wise, we have a two level page table structure, where the first
+ * level has 4096 entries, and the second level has 256 entries. Each entry
+ * is one 32-bit word. Most of the bits in the second level entry are used
+ * by hardware, and there aren't any "accessed" and "dirty" bits.
+ *
+ * Linux on the other hand has a three level page table structure, which can
+ * be wrapped to fit a two level page table structure easily - using the PGD
+ * and PTE only. However, Linux also expects one "PTE" table per page, and
+ * at least a "dirty" bit.
+ *
+ * Therefore, we tweak the implementation slightly - we tell Linux that we
+ * have 2048 entries in the first level, each of which is 8 bytes (iow, two
+ * hardware pointers to the second level.) The second level contains two
+ * hardware PTE tables arranged contiguously, followed by Linux versions
+ * which contain the state information Linux needs. We, therefore, end up
+ * with 512 entries in the "PTE" level.
+ *
+ * This leads to the page tables having the following layout:
+ *
+ * pgd pte
+ * | |
+ * +--------+ +0
+ * | |-----> +------------+ +0
+ * +- - - - + +4 | h/w pt 0 |
+ * | |-----> +------------+ +1024
+ * +--------+ +8 | h/w pt 1 |
+ * | | +------------+ +2048
+ * +- - - - + | Linux pt 0 |
+ * | | +------------+ +3072
+ * +--------+ | Linux pt 1 |
+ * | | +------------+ +4096
+ *
+ * See L_PTE_xxx below for definitions of bits in the "Linux pt", and
+ * PTE_xxx for definitions of bits appearing in the "h/w pt".
+ *
+ * PMD_xxx definitions refer to bits in the first level page table.
+ *
+ * The "dirty" bit is emulated by only granting hardware write permission
+ * iff the page is marked "writable" and "dirty" in the Linux PTE. This
+ * means that a write to a clean page will cause a permission fault, and
+ * the Linux MM layer will mark the page dirty via handle_pte_fault().
+ * For the hardware to notice the permission change, the TLB entry must
+ * be flushed, and ptep_establish() does that for us.
+ *
+ * The "accessed" or "young" bit is emulated by a similar method; we only
+ * allow accesses to the page if the "young" bit is set. Accesses to the
+ * page will cause a fault, and handle_pte_fault() will set the young bit
+ * for us as long as the page is marked present in the corresponding Linux
+ * PTE entry. Again, ptep_establish() will ensure that the TLB is up to
+ * date.
+ *
+ * However, when the "young" bit is cleared, we deny access to the page
+ * by clearing the hardware PTE. Currently Linux does not flush the TLB
+ * for us in this case, which means the TLB will retain the transation
+ * until either the TLB entry is evicted under pressure, or a context
+ * switch which changes the user space mapping occurs.
*/
#define PTRS_PER_PTE 512
#define PTRS_PER_PMD 1