/*- * Copyright (c) 2009-2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include <sys/param.h> /* powerof2() */ #include <sys/queue.h> #include <bitstring.h> #include <errno.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <pjdlog.h> #include "activemap.h" #ifndef PJDLOG_ASSERT #include <assert.h> #define PJDLOG_ASSERT(...) assert(__VA_ARGS__) #endif #define ACTIVEMAP_MAGIC 0xac71e4 struct activemap { int am_magic; /* Magic value. */ off_t am_mediasize; /* Media size in bytes. 
*/ uint32_t am_extentsize; /* Extent size in bytes, must be power of 2. */ uint8_t am_extentshift;/* 2 ^ extentbits == extentsize */ int am_nextents; /* Number of extents. */ size_t am_mapsize; /* Bitmap size in bytes. */ uint16_t *am_memtab; /* An array that holds number of pending writes per extent. */ bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */ bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */ size_t am_diskmapsize; /* Map size rounded up to sector size. */ uint64_t am_ndirty; /* Number of dirty regions. */ bitstr_t *am_syncmap; /* Bitmap of extents to sync. */ off_t am_syncoff; /* Next synchronization offset. */ TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that we keep dirty to reduce bitmap updates. */ int am_nkeepdirty; /* Number of am_keepdirty elements. */ int am_nkeepdirty_limit; /* Maximum number of am_keepdirty elements. */ }; struct keepdirty { int kd_extent; TAILQ_ENTRY(keepdirty) kd_next; }; /* * Helper function taken from sys/systm.h to calculate extentshift. */ static uint32_t bitcount32(uint32_t x) { x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1); x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2); x = (x + (x >> 4)) & 0x0f0f0f0f; x = (x + (x >> 8)); x = (x + (x >> 16)) & 0x000000ff; return (x); } static __inline int off2ext(const struct activemap *amp, off_t offset) { int extent; PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize); extent = (offset >> amp->am_extentshift); PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); return (extent); } static __inline off_t ext2off(const struct activemap *amp, int extent) { off_t offset; PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); offset = ((off_t)extent << amp->am_extentshift); PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize); return (offset); } /* * Function calculates number of requests needed to synchronize the given * extent. 
*/ static __inline int ext2reqs(const struct activemap *amp, int ext) { off_t left; if (ext < amp->am_nextents - 1) return (((amp->am_extentsize - 1) / MAXPHYS) + 1); PJDLOG_ASSERT(ext == amp->am_nextents - 1); left = amp->am_mediasize % amp->am_extentsize; if (left == 0) left = amp->am_extentsize; return (((left - 1) / MAXPHYS) + 1); } /* * Initialize activemap structure and allocate memory for internal needs. * Function returns 0 on success and -1 if any of the allocations failed. */ int activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty) { struct activemap *amp; PJDLOG_ASSERT(ampp != NULL); PJDLOG_ASSERT(mediasize > 0); PJDLOG_ASSERT(extentsize > 0); PJDLOG_ASSERT(powerof2(extentsize)); PJDLOG_ASSERT(sectorsize > 0); PJDLOG_ASSERT(powerof2(sectorsize)); PJDLOG_ASSERT(keepdirty > 0); amp = malloc(sizeof(*amp)); if (amp == NULL) return (-1); amp->am_mediasize = mediasize; amp->am_nkeepdirty_limit = keepdirty; amp->am_extentsize = extentsize; amp->am_extentshift = bitcount32(extentsize - 1); amp->am_nextents = ((mediasize - 1) / extentsize) + 1; amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents); amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize); amp->am_ndirty = 0; amp->am_syncoff = -2; TAILQ_INIT(&->am_keepdirty); amp->am_nkeepdirty = 0; amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0])); amp->am_diskmap = calloc(1, amp->am_diskmapsize); amp->am_memmap = bit_alloc(amp->am_nextents); amp->am_syncmap = bit_alloc(amp->am_nextents); /* * Check to see if any of the allocations above failed. 
*/ if (amp->am_memtab == NULL || amp->am_diskmap == NULL || amp->am_memmap == NULL || amp->am_syncmap == NULL) { if (amp->am_memtab != NULL) free(amp->am_memtab); if (amp->am_diskmap != NULL) free(amp->am_diskmap); if (amp->am_memmap != NULL) free(amp->am_memmap); if (amp->am_syncmap != NULL) free(amp->am_syncmap); amp->am_magic = 0; free(amp); errno = ENOMEM; return (-1); } amp->am_magic = ACTIVEMAP_MAGIC; *ampp = amp; return (0); } static struct keepdirty * keepdirty_find(struct activemap *amp, int extent) { struct keepdirty *kd; TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) { if (kd->kd_extent == extent) break; } return (kd); } static bool keepdirty_add(struct activemap *amp, int extent) { struct keepdirty *kd; kd = keepdirty_find(amp, extent); if (kd != NULL) { /* * Only move element at the beginning. */ TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next); return (false); } /* * Add new element, but first remove the most unused one if * we have too many. */ if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) { kd = TAILQ_LAST(&->am_keepdirty, skeepdirty); PJDLOG_ASSERT(kd != NULL); TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); amp->am_nkeepdirty--; PJDLOG_ASSERT(amp->am_nkeepdirty > 0); } if (kd == NULL) kd = malloc(sizeof(*kd)); /* We can ignore allocation failure. */ if (kd != NULL) { kd->kd_extent = extent; amp->am_nkeepdirty++; TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next); } return (true); } static void keepdirty_fill(struct activemap *amp) { struct keepdirty *kd; TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) bit_set(amp->am_diskmap, kd->kd_extent); } static void keepdirty_free(struct activemap *amp) { struct keepdirty *kd; while ((kd = TAILQ_FIRST(&->am_keepdirty)) != NULL) { TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); amp->am_nkeepdirty--; free(kd); } PJDLOG_ASSERT(amp->am_nkeepdirty == 0); } /* * Function frees resources allocated by activemap_init() function. 
*/ void activemap_free(struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); amp->am_magic = 0; keepdirty_free(amp); free(amp->am_memtab); free(amp->am_diskmap); free(amp->am_memmap); free(amp->am_syncmap); } /* * Function should be called before we handle write requests. It updates * internal structures and returns true if on-disk metadata should be updated. */ bool activemap_write_start(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(length > 0); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { /* * If the number of pending writes is increased from 0, * we have to mark the extent as dirty also in on-disk bitmap. * By returning true we inform the caller that on-disk bitmap * was modified and has to be flushed to disk. */ if (amp->am_memtab[ext]++ == 0) { PJDLOG_ASSERT(!bit_test(amp->am_memmap, ext)); bit_set(amp->am_memmap, ext); amp->am_ndirty++; } if (keepdirty_add(amp, ext)) modified = true; } return (modified); } /* * Function should be called after receiving write confirmation. It updates * internal structures and returns true if on-disk metadata should be updated. */ bool activemap_write_complete(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(length > 0); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { /* * If the number of pending writes goes down to 0, we have to * mark the extent as clean also in on-disk bitmap. * By returning true we inform the caller that on-disk bitmap * was modified and has to be flushed to disk. 
*/ PJDLOG_ASSERT(amp->am_memtab[ext] > 0); PJDLOG_ASSERT(bit_test(amp->am_memmap, ext)); if (--amp->am_memtab[ext] == 0) { bit_clear(amp->am_memmap, ext); amp->am_ndirty--; if (keepdirty_find(amp, ext) == NULL) modified = true; } } return (modified); } /* * Function should be called after finishing synchronization of one extent. * It returns true if on-disk metadata should be updated. */ bool activemap_extent_complete(struct activemap *amp, int extent) { bool modified; int reqs; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); modified = false; reqs = ext2reqs(amp, extent); PJDLOG_ASSERT(amp->am_memtab[extent] >= reqs); amp->am_memtab[extent] -= reqs; PJDLOG_ASSERT(bit_test(amp->am_memmap, extent)); if (amp->am_memtab[extent] == 0) { bit_clear(amp->am_memmap, extent); amp->am_ndirty--; modified = true; } return (modified); } /* * Function returns number of dirty regions. */ uint64_t activemap_ndirty(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_ndirty); } /* * Function compare on-disk bitmap and in-memory bitmap and returns true if * they differ and should be flushed to the disk. */ bool activemap_differ(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (memcmp(amp->am_diskmap, amp->am_memmap, amp->am_mapsize) != 0); } /* * Function returns number of bytes used by bitmap. */ size_t activemap_size(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_mapsize); } /* * Function returns number of bytes needed for storing on-disk bitmap. * This is the same as activemap_size(), but rounded up to sector size. */ size_t activemap_ondisk_size(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_diskmapsize); } /* * Function copies the given buffer read from disk to the internal bitmap. 
*/
/*
 * buf must hold at least am_mapsize bytes (asserted through size); the
 * first am_mapsize bytes are copied into the on-disk, in-memory and
 * synchronization bitmaps alike.  Every extent marked in the buffer gets
 * its counter preloaded and is counted as dirty.
 */
void
activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size)
{
	int ext;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(size >= amp->am_mapsize);

	memcpy(amp->am_diskmap, buf, amp->am_mapsize);
	memcpy(amp->am_memmap, buf, amp->am_mapsize);
	memcpy(amp->am_syncmap, buf, amp->am_mapsize);

	/* Locate the first dirty extent; ext is -1 when there is none. */
	bit_ffs(amp->am_memmap, amp->am_nextents, &ext);
	if (ext == -1) {
		/*
		 * There are no dirty extents, so we can leave now.
		 * Note that am_ndirty, am_memtab and am_syncoff are left
		 * untouched on this path.
		 */
		return;
	}
	/*
	 * Set synchronization offset to the first dirty extent.
	 */
	activemap_sync_rewind(amp);
	/*
	 * We have dirty extents and we want them to stay that way until
	 * we synchronize, so we set number of pending writes to number
	 * of requests needed to synchronize one extent.
	 */
	amp->am_ndirty = 0;
	/* Extents before the first dirty one are clean, so start from it. */
	for (; ext < amp->am_nextents; ext++) {
		if (bit_test(amp->am_memmap, ext)) {
			amp->am_memtab[ext] = ext2reqs(amp, ext);
			amp->am_ndirty++;
		}
	}
}

/*
 * Function merges the given bitmap with existing one.
 *
 * buf must hold at least am_mapsize bytes (asserted through size).  Every
 * extent that is dirty in buf and not yet marked for synchronization
 * locally is marked in all three bitmaps and has its counter set to the
 * number of requests needed to synchronize it.  The synchronization
 * offset is rewound afterwards.
 */
void
activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size)
{
	/* The buffer is only read; the cast merely drops the qualifier. */
	bitstr_t *remmap = __DECONST(bitstr_t *, buf);
	int ext;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(size >= amp->am_mapsize);

	bit_ffs(remmap, amp->am_nextents, &ext);
	if (ext == -1) {
		/* There are no dirty extents, so we can leave now. */
		return;
	}
	/*
	 * We have dirty extents and we want them to stay that way until
	 * we synchronize, so we set number of pending writes to number
	 * of requests needed to synchronize one extent.
	 */
	for (; ext < amp->am_nextents; ext++) {
		/* Local extent already dirty. */
		if (bit_test(amp->am_syncmap, ext))
			continue;
		/* Remote extent isn't dirty. */
		if (!bit_test(remmap, ext))
			continue;
		bit_set(amp->am_syncmap, ext);
		bit_set(amp->am_memmap, ext);
		bit_set(amp->am_diskmap, ext);
		/* Count the extent only if it had no pending requests yet. */
		if (amp->am_memtab[ext] == 0)
			amp->am_ndirty++;
		/*
		 * NOTE(review): this overwrites the counter, whereas
		 * activemap_need_sync() adds to it - confirm that no regular
		 * writes can be pending on the extent at this point.
		 */
		amp->am_memtab[ext] = ext2reqs(amp, ext);
	}
	/*
	 * Set synchronization offset to the first dirty extent.
	 */
	activemap_sync_rewind(amp);
}

/*
 * Function returns pointer to internal bitmap that should be written to disk.
 *
 * The on-disk bitmap is refreshed from the in-memory bitmap and then all
 * extents from the keep-dirty list are marked dirty in it as well.  If
 * sizep is not NULL, the on-disk map size (am_diskmapsize) is stored there.
 * The returned pointer refers to memory owned by the activemap.
 */
const unsigned char *
activemap_bitmap(struct activemap *amp, size_t *sizep)
{

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	if (sizep != NULL)
		*sizep = amp->am_diskmapsize;
	memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize);
	keepdirty_fill(amp);
	return ((const unsigned char *)amp->am_diskmap);
}

/*
 * Function calculates size needed to store bitmap on disk.
 *
 * No activemap is needed; the result is the bitmap size for the given
 * media and extent size, rounded up to sectorsize.  mediasize must be
 * greater than zero and both extentsize and sectorsize must be non-zero
 * powers of 2.
 */
size_t
activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize,
    uint32_t sectorsize)
{
	uint64_t nextents, mapsize;

	PJDLOG_ASSERT(mediasize > 0);
	PJDLOG_ASSERT(extentsize > 0);
	PJDLOG_ASSERT(powerof2(extentsize));
	PJDLOG_ASSERT(sectorsize > 0);
	PJDLOG_ASSERT(powerof2(sectorsize));

	/* Round up so that a trailing partial extent is counted as well. */
	nextents = ((mediasize - 1) / extentsize) + 1;
	mapsize = sizeof(bitstr_t) * bitstr_size(nextents);
	return (roundup2(mapsize, sectorsize));
}

/*
 * Set synchronization offset to the first dirty extent.
 *
 * The offset itself is not computed here: am_syncoff is set to -1 so that
 * the next activemap_sync_offset() call looks the extent up, or to -2 when
 * the synchronization bitmap is empty.
 */
void
activemap_sync_rewind(struct activemap *amp)
{
	int ext;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
	if (ext == -1) {
		/* There are no extents to synchronize. */
		amp->am_syncoff = -2;
		return;
	}
	/*
	 * Mark that we want to start synchronization from the beginning.
	 */
	amp->am_syncoff = -1;
}

/*
 * Return next offset of where we should synchronize.
 *
 * Each call advances by at most MAXPHYS bytes and never crosses an extent
 * boundary or the end of the media.
 *
 * lengthp  - receives the number of bytes to synchronize at the returned
 *            offset; written only when a valid offset is returned.
 * syncextp - receives the number of the extent whose synchronization was
 *            just finished by this call, or -1 if none was.
 *
 * Returns the offset to synchronize, or -1 when there is nothing (more)
 * to synchronize.
 */
off_t
activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp)
{
	off_t syncoff, left;
	int ext;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);
	PJDLOG_ASSERT(lengthp != NULL);
	PJDLOG_ASSERT(syncextp != NULL);

	*syncextp = -1;

	/* -2 means that there is nothing to synchronize. */
	if (amp->am_syncoff == -2)
		return (-1);

	/*
	 * A non-negative am_syncoff is the offset handed out by the previous
	 * call.  If the next MAXPHYS step would leave the media or land in
	 * a different extent, the current extent has been fully handed out.
	 */
	if (amp->am_syncoff >= 0 &&
	    (amp->am_syncoff + MAXPHYS >= amp->am_mediasize ||
	     off2ext(amp, amp->am_syncoff) !=
	     off2ext(amp, amp->am_syncoff + MAXPHYS))) {
		/*
		 * We are about to change extent, so mark previous one as clean.
		 */
		ext = off2ext(amp, amp->am_syncoff);
		bit_clear(amp->am_syncmap, ext);
		*syncextp = ext;
		/* Force a lookup of the next extent to synchronize below. */
		amp->am_syncoff = -1;
	}

	if (amp->am_syncoff == -1) {
		/*
		 * Let's find first extent to synchronize.
		 */
		bit_ffs(amp->am_syncmap, amp->am_nextents, &ext);
		if (ext == -1) {
			amp->am_syncoff = -2;
			return (-1);
		}
		amp->am_syncoff = ext2off(amp, ext);
	} else {
		/*
		 * We don't change extent, so just increase offset.
		 */
		amp->am_syncoff += MAXPHYS;
		if (amp->am_syncoff >= amp->am_mediasize) {
			amp->am_syncoff = -2;
			return (-1);
		}
	}

	syncoff = amp->am_syncoff;
	/* Bytes left until the end of the extent containing syncoff... */
	left = ext2off(amp, off2ext(amp, syncoff)) +
	    amp->am_extentsize - syncoff;
	/* ...limited to the end of the media... */
	if (syncoff + left > amp->am_mediasize)
		left = amp->am_mediasize - syncoff;
	/* ...and to the size of a single request. */
	if (left > MAXPHYS)
		left = MAXPHYS;

	PJDLOG_ASSERT(left >= 0 && left <= MAXPHYS);
	PJDLOG_ASSERT(syncoff >= 0 && syncoff < amp->am_mediasize);
	PJDLOG_ASSERT(syncoff + left >= 0 &&
	    syncoff + left <= amp->am_mediasize);

	*lengthp = left;
	return (syncoff);
}

/*
 * Mark extent(s) containing the given region for synchronization.
 * Most likely one of the components is unavailable.
 *
 * Returns true if at least one extent was newly marked for
 * synchronization.
 * NOTE(review): unlike the write functions, length is not asserted to be
 * positive here - presumably callers always pass length > 0; confirm.
 */
bool
activemap_need_sync(struct activemap *amp, off_t offset, off_t length)
{
	bool modified;
	off_t end;
	int ext;

	PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC);

	modified = false;
	end = offset + length - 1;

	for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) {
		if (bit_test(amp->am_syncmap, ext)) {
			/* Already marked for synchronization. */
			PJDLOG_ASSERT(bit_test(amp->am_memmap, ext));
			continue;
		}
		bit_set(amp->am_syncmap, ext);
		/* Count the extent as dirty only if it was not dirty already. */
		if (!bit_test(amp->am_memmap, ext)) {
			bit_set(amp->am_memmap, ext);
			amp->am_ndirty++;
		}
		/*
		 * Add the synchronization requests on top of whatever is
		 * already pending for this extent.
		 */
		amp->am_memtab[ext] += ext2reqs(amp, ext);
		modified = true;
	}

	return (modified);
}

/*
 * Print the three bitmaps to standard output, one line each: "M: " for
 * the in-memory map, "D: " for the on-disk map and "S: " for the
 * synchronization map, followed by one 0/1 digit per extent.
 * Unlike the other public functions, the magic value is not checked here.
 */
void
activemap_dump(const struct activemap *amp)
{
	int bit;

	printf("M: ");
	for (bit = 0; bit < amp->am_nextents; bit++)
		printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0);
	printf("\n");
	printf("D: ");
	for (bit = 0; bit < amp->am_nextents; bit++)
		printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0);
	printf("\n");
	printf("S: ");
	for (bit = 0; bit < amp->am_nextents; bit++)
		printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0);
	printf("\n");
}