mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-10 15:19:51 +00:00
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil: "We have a pile of bug fixes from Ilya, including a few patches that sync up the CRUSH code with the latest from userspace. There is also a long series from Zheng that fixes various issues with snapshots, inline data, and directory fsync, some simplification and improvement in the cap release code, and a rework of the caching of directory contents. To top it off there are a few small fixes and cleanups from Benoit and Hong" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (40 commits) rbd: use GFP_NOIO in rbd_obj_request_create() crush: fix a bug in tree bucket decode libceph: Fix ceph_tcp_sendpage()'s more boolean usage libceph: Remove spurious kunmap() of the zero page rbd: queue_depth map option rbd: store rbd_options in rbd_device rbd: terminate rbd_opts_tokens with Opt_err ceph: fix ceph_writepages_start() rbd: bump queue_max_segments ceph: rework dcache readdir crush: sync up with userspace crush: fix crash from invalid 'take' argument ceph: switch some GFP_NOFS memory allocation to GFP_KERNEL ceph: pre-allocate data structure that tracks caps flushing ceph: re-send flushing caps (which are revoked) in reconnect stage ceph: send TID of the oldest pending caps flush to MDS ceph: track pending caps flushing globally ceph: track pending caps flushing accurately libceph: fix wrong name "Ceph filesystem for Linux" ceph: fix directory fsync ...
This commit is contained in:
commit
0c76c6ba24
@ -346,6 +346,7 @@ struct rbd_device {
|
||||
struct rbd_image_header header;
|
||||
unsigned long flags; /* possibly lock protected */
|
||||
struct rbd_spec *spec;
|
||||
struct rbd_options *opts;
|
||||
|
||||
char *header_name;
|
||||
|
||||
@ -724,34 +725,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
|
||||
}
|
||||
|
||||
/*
|
||||
* mount options
|
||||
* (Per device) rbd map options
|
||||
*/
|
||||
enum {
|
||||
Opt_queue_depth,
|
||||
Opt_last_int,
|
||||
/* int args above */
|
||||
Opt_last_string,
|
||||
/* string args above */
|
||||
Opt_read_only,
|
||||
Opt_read_write,
|
||||
/* Boolean args above */
|
||||
Opt_last_bool,
|
||||
Opt_err
|
||||
};
|
||||
|
||||
static match_table_t rbd_opts_tokens = {
|
||||
{Opt_queue_depth, "queue_depth=%d"},
|
||||
/* int args above */
|
||||
/* string args above */
|
||||
{Opt_read_only, "read_only"},
|
||||
{Opt_read_only, "ro"}, /* Alternate spelling */
|
||||
{Opt_read_write, "read_write"},
|
||||
{Opt_read_write, "rw"}, /* Alternate spelling */
|
||||
/* Boolean args above */
|
||||
{-1, NULL}
|
||||
{Opt_err, NULL}
|
||||
};
|
||||
|
||||
struct rbd_options {
|
||||
int queue_depth;
|
||||
bool read_only;
|
||||
};
|
||||
|
||||
#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ
|
||||
#define RBD_READ_ONLY_DEFAULT false
|
||||
|
||||
static int parse_rbd_opts_token(char *c, void *private)
|
||||
@ -761,27 +764,27 @@ static int parse_rbd_opts_token(char *c, void *private)
|
||||
int token, intval, ret;
|
||||
|
||||
token = match_token(c, rbd_opts_tokens, argstr);
|
||||
if (token < 0)
|
||||
return -EINVAL;
|
||||
|
||||
if (token < Opt_last_int) {
|
||||
ret = match_int(&argstr[0], &intval);
|
||||
if (ret < 0) {
|
||||
pr_err("bad mount option arg (not int) "
|
||||
"at '%s'\n", c);
|
||||
pr_err("bad mount option arg (not int) at '%s'\n", c);
|
||||
return ret;
|
||||
}
|
||||
dout("got int token %d val %d\n", token, intval);
|
||||
} else if (token > Opt_last_int && token < Opt_last_string) {
|
||||
dout("got string token %d val %s\n", token,
|
||||
argstr[0].from);
|
||||
} else if (token > Opt_last_string && token < Opt_last_bool) {
|
||||
dout("got Boolean token %d\n", token);
|
||||
dout("got string token %d val %s\n", token, argstr[0].from);
|
||||
} else {
|
||||
dout("got token %d\n", token);
|
||||
}
|
||||
|
||||
switch (token) {
|
||||
case Opt_queue_depth:
|
||||
if (intval < 1) {
|
||||
pr_err("queue_depth out of range\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
rbd_opts->queue_depth = intval;
|
||||
break;
|
||||
case Opt_read_only:
|
||||
rbd_opts->read_only = true;
|
||||
break;
|
||||
@ -789,9 +792,10 @@ static int parse_rbd_opts_token(char *c, void *private)
|
||||
rbd_opts->read_only = false;
|
||||
break;
|
||||
default:
|
||||
rbd_assert(false);
|
||||
break;
|
||||
/* libceph prints "bad option" msg */
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1563,22 +1567,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
|
||||
/*
|
||||
* Wait for an object request to complete. If interrupted, cancel the
|
||||
* underlying osd request.
|
||||
*
|
||||
* @timeout: in jiffies, 0 means "wait forever"
|
||||
*/
|
||||
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
|
||||
static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request,
|
||||
unsigned long timeout)
|
||||
{
|
||||
int ret;
|
||||
long ret;
|
||||
|
||||
dout("%s %p\n", __func__, obj_request);
|
||||
|
||||
ret = wait_for_completion_interruptible(&obj_request->completion);
|
||||
if (ret < 0) {
|
||||
dout("%s %p interrupted\n", __func__, obj_request);
|
||||
ret = wait_for_completion_interruptible_timeout(
|
||||
&obj_request->completion,
|
||||
ceph_timeout_jiffies(timeout));
|
||||
if (ret <= 0) {
|
||||
if (ret == 0)
|
||||
ret = -ETIMEDOUT;
|
||||
rbd_obj_request_end(obj_request);
|
||||
return ret;
|
||||
} else {
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
dout("%s %p done\n", __func__, obj_request);
|
||||
return 0;
|
||||
dout("%s %p ret %d\n", __func__, obj_request, (int)ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Wait for an object request to complete with no time limit: a timeout
 * of 0 is handed to __rbd_obj_request_wait().
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	const unsigned long no_timeout = 0;

	return __rbd_obj_request_wait(obj_request, no_timeout);
}
|
||||
|
||||
/*
 * Bounded variant of rbd_obj_request_wait(): @timeout is forwarded
 * unchanged to __rbd_obj_request_wait(), whose result is returned.
 */
static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request,
					unsigned long timeout)
{
	int result = __rbd_obj_request_wait(obj_request, timeout);

	return result;
}
|
||||
|
||||
static void rbd_img_request_complete(struct rbd_img_request *img_request)
|
||||
@ -2001,11 +2022,11 @@ static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
|
||||
rbd_assert(obj_request_type_valid(type));
|
||||
|
||||
size = strlen(object_name) + 1;
|
||||
name = kmalloc(size, GFP_KERNEL);
|
||||
name = kmalloc(size, GFP_NOIO);
|
||||
if (!name)
|
||||
return NULL;
|
||||
|
||||
obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
|
||||
obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
|
||||
if (!obj_request) {
|
||||
kfree(name);
|
||||
return NULL;
|
||||
@ -2376,7 +2397,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
|
||||
}
|
||||
|
||||
if (opcode == CEPH_OSD_OP_DELETE)
|
||||
osd_req_op_init(osd_request, num_ops, opcode);
|
||||
osd_req_op_init(osd_request, num_ops, opcode, 0);
|
||||
else
|
||||
osd_req_op_extent_init(osd_request, num_ops, opcode,
|
||||
offset, length, 0, 0);
|
||||
@ -2848,7 +2869,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
|
||||
goto out;
|
||||
stat_request->callback = rbd_img_obj_exists_callback;
|
||||
|
||||
osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
|
||||
osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
|
||||
osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
|
||||
false, false);
|
||||
rbd_osd_req_format_read(stat_request);
|
||||
@ -3122,6 +3143,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
|
||||
bool watch)
|
||||
{
|
||||
struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
|
||||
struct ceph_options *opts = osdc->client->options;
|
||||
struct rbd_obj_request *obj_request;
|
||||
int ret;
|
||||
|
||||
@ -3148,7 +3170,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = rbd_obj_request_wait(obj_request);
|
||||
ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@ -3750,10 +3772,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
|
||||
|
||||
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
|
||||
rbd_dev->tag_set.ops = &rbd_mq_ops;
|
||||
rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
|
||||
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
|
||||
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
|
||||
rbd_dev->tag_set.flags =
|
||||
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
|
||||
rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
|
||||
rbd_dev->tag_set.nr_hw_queues = 1;
|
||||
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
|
||||
|
||||
@ -3773,6 +3794,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
|
||||
/* set io sizes to object size */
|
||||
segment_size = rbd_obj_bytes(&rbd_dev->header);
|
||||
blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
|
||||
blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
|
||||
blk_queue_max_segment_size(q, segment_size);
|
||||
blk_queue_io_min(q, segment_size);
|
||||
blk_queue_io_opt(q, segment_size);
|
||||
@ -4044,7 +4066,8 @@ static void rbd_spec_free(struct kref *kref)
|
||||
}
|
||||
|
||||
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
|
||||
struct rbd_spec *spec)
|
||||
struct rbd_spec *spec,
|
||||
struct rbd_options *opts)
|
||||
{
|
||||
struct rbd_device *rbd_dev;
|
||||
|
||||
@ -4058,8 +4081,9 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
|
||||
INIT_LIST_HEAD(&rbd_dev->node);
|
||||
init_rwsem(&rbd_dev->header_rwsem);
|
||||
|
||||
rbd_dev->spec = spec;
|
||||
rbd_dev->rbd_client = rbdc;
|
||||
rbd_dev->spec = spec;
|
||||
rbd_dev->opts = opts;
|
||||
|
||||
/* Initialize the layout used for all rbd requests */
|
||||
|
||||
@ -4075,6 +4099,7 @@ static void rbd_dev_destroy(struct rbd_device *rbd_dev)
|
||||
{
|
||||
rbd_put_client(rbd_dev->rbd_client);
|
||||
rbd_spec_put(rbd_dev->spec);
|
||||
kfree(rbd_dev->opts);
|
||||
kfree(rbd_dev);
|
||||
}
|
||||
|
||||
@ -4933,6 +4958,7 @@ static int rbd_add_parse_args(const char *buf,
|
||||
goto out_mem;
|
||||
|
||||
rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
|
||||
rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
|
||||
|
||||
copts = ceph_parse_options(options, mon_addrs,
|
||||
mon_addrs + mon_addrs_size - 1,
|
||||
@ -4963,8 +4989,8 @@ out_err:
|
||||
*/
|
||||
static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
|
||||
{
|
||||
struct ceph_options *opts = rbdc->client->options;
|
||||
u64 newest_epoch;
|
||||
unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
|
||||
int tries = 0;
|
||||
int ret;
|
||||
|
||||
@ -4979,7 +5005,8 @@ again:
|
||||
if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
|
||||
ceph_monc_request_next_osdmap(&rbdc->client->monc);
|
||||
(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
|
||||
newest_epoch, timeout);
|
||||
newest_epoch,
|
||||
opts->mount_timeout);
|
||||
goto again;
|
||||
} else {
|
||||
/* the osdmap we have is new enough */
|
||||
@ -5148,7 +5175,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
|
||||
rbdc = __rbd_get_client(rbd_dev->rbd_client);
|
||||
|
||||
ret = -ENOMEM;
|
||||
parent = rbd_dev_create(rbdc, parent_spec);
|
||||
parent = rbd_dev_create(rbdc, parent_spec, NULL);
|
||||
if (!parent)
|
||||
goto out_err;
|
||||
|
||||
@ -5394,9 +5421,6 @@ static ssize_t do_rbd_add(struct bus_type *bus,
|
||||
rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
|
||||
if (rc < 0)
|
||||
goto err_out_module;
|
||||
read_only = rbd_opts->read_only;
|
||||
kfree(rbd_opts);
|
||||
rbd_opts = NULL; /* done with this */
|
||||
|
||||
rbdc = rbd_get_client(ceph_opts);
|
||||
if (IS_ERR(rbdc)) {
|
||||
@ -5422,11 +5446,12 @@ static ssize_t do_rbd_add(struct bus_type *bus,
|
||||
goto err_out_client;
|
||||
}
|
||||
|
||||
rbd_dev = rbd_dev_create(rbdc, spec);
|
||||
rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
|
||||
if (!rbd_dev)
|
||||
goto err_out_client;
|
||||
rbdc = NULL; /* rbd_dev now owns this */
|
||||
spec = NULL; /* rbd_dev now owns this */
|
||||
rbd_opts = NULL; /* rbd_dev now owns this */
|
||||
|
||||
rc = rbd_dev_image_probe(rbd_dev, true);
|
||||
if (rc < 0)
|
||||
@ -5434,6 +5459,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
|
||||
|
||||
/* If we are mapping a snapshot it must be marked read-only */
|
||||
|
||||
read_only = rbd_dev->opts->read_only;
|
||||
if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
|
||||
read_only = true;
|
||||
rbd_dev->mapping.read_only = read_only;
|
||||
@ -5458,6 +5484,7 @@ err_out_client:
|
||||
rbd_put_client(rbdc);
|
||||
err_out_args:
|
||||
rbd_spec_put(spec);
|
||||
kfree(rbd_opts);
|
||||
err_out_module:
|
||||
module_put(THIS_MODULE);
|
||||
|
||||
|
@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
|
||||
val_size2 = posix_acl_xattr_size(default_acl->a_count);
|
||||
|
||||
err = -ENOMEM;
|
||||
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
|
||||
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
|
||||
if (!tmp_buf)
|
||||
goto out_err;
|
||||
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
|
||||
pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
|
||||
if (!pagelist)
|
||||
goto out_err;
|
||||
ceph_pagelist_init(pagelist);
|
||||
|
308
fs/ceph/addr.c
308
fs/ceph/addr.c
@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page)
|
||||
inode = mapping->host;
|
||||
ci = ceph_inode(inode);
|
||||
|
||||
/*
|
||||
* Note that we're grabbing a snapc ref here without holding
|
||||
* any locks!
|
||||
*/
|
||||
snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
|
||||
|
||||
/* dirty the head */
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (ci->i_head_snapc == NULL)
|
||||
ci->i_head_snapc = ceph_get_snap_context(snapc);
|
||||
++ci->i_wrbuffer_ref_head;
|
||||
BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
|
||||
if (__ceph_have_pending_cap_snap(ci)) {
|
||||
struct ceph_cap_snap *capsnap =
|
||||
list_last_entry(&ci->i_cap_snaps,
|
||||
struct ceph_cap_snap,
|
||||
ci_item);
|
||||
snapc = ceph_get_snap_context(capsnap->context);
|
||||
capsnap->dirty_pages++;
|
||||
} else {
|
||||
BUG_ON(!ci->i_head_snapc);
|
||||
snapc = ceph_get_snap_context(ci->i_head_snapc);
|
||||
++ci->i_wrbuffer_ref_head;
|
||||
}
|
||||
if (ci->i_wrbuffer_ref == 0)
|
||||
ihold(inode);
|
||||
++ci->i_wrbuffer_ref;
|
||||
@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
|
||||
|
||||
/* build page vector */
|
||||
nr_pages = calc_pages_for(0, len);
|
||||
pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
|
||||
pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL);
|
||||
ret = -ENOMEM;
|
||||
if (!pages)
|
||||
goto out;
|
||||
@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
|
||||
dout("start_read %p adding %p idx %lu\n", inode, page,
|
||||
page->index);
|
||||
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
|
||||
GFP_NOFS)) {
|
||||
GFP_KERNEL)) {
|
||||
ceph_fscache_uncache_page(inode, page);
|
||||
page_cache_release(page);
|
||||
dout("start_read %p add_to_page_cache failed %p\n",
|
||||
@ -436,7 +440,7 @@ out:
|
||||
* only snap context we are allowed to write back.
|
||||
*/
|
||||
static struct ceph_snap_context *get_oldest_context(struct inode *inode,
|
||||
u64 *snap_size)
|
||||
loff_t *snap_size)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_snap_context *snapc = NULL;
|
||||
@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
||||
struct ceph_osd_client *osdc;
|
||||
struct ceph_snap_context *snapc, *oldest;
|
||||
loff_t page_off = page_offset(page);
|
||||
loff_t snap_size = -1;
|
||||
long writeback_stat;
|
||||
u64 truncate_size, snap_size = 0;
|
||||
u64 truncate_size;
|
||||
u32 truncate_seq;
|
||||
int err = 0, len = PAGE_CACHE_SIZE;
|
||||
|
||||
@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
truncate_seq = ci->i_truncate_seq;
|
||||
truncate_size = ci->i_truncate_size;
|
||||
if (!snap_size)
|
||||
if (snap_size == -1)
|
||||
snap_size = i_size_read(inode);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping,
|
||||
unsigned wsize = 1 << inode->i_blkbits;
|
||||
struct ceph_osd_request *req = NULL;
|
||||
int do_sync = 0;
|
||||
u64 truncate_size, snap_size;
|
||||
loff_t snap_size, i_size;
|
||||
u64 truncate_size;
|
||||
u32 truncate_seq;
|
||||
|
||||
/*
|
||||
@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping,
|
||||
retry:
|
||||
/* find oldest snap context with dirty data */
|
||||
ceph_put_snap_context(snapc);
|
||||
snap_size = 0;
|
||||
snap_size = -1;
|
||||
snapc = get_oldest_context(inode, &snap_size);
|
||||
if (!snapc) {
|
||||
/* hmm, why does writepages get called when there
|
||||
@ -749,16 +755,13 @@ retry:
|
||||
dout(" no snap context with dirty data?\n");
|
||||
goto out;
|
||||
}
|
||||
if (snap_size == 0)
|
||||
snap_size = i_size_read(inode);
|
||||
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
|
||||
snapc, snapc->seq, snapc->num_snaps);
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
truncate_seq = ci->i_truncate_seq;
|
||||
truncate_size = ci->i_truncate_size;
|
||||
if (!snap_size)
|
||||
snap_size = i_size_read(inode);
|
||||
i_size = i_size_read(inode);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
if (last_snapc && snapc != last_snapc) {
|
||||
@ -828,8 +831,10 @@ get_more_pages:
|
||||
dout("waiting on writeback %p\n", page);
|
||||
wait_on_page_writeback(page);
|
||||
}
|
||||
if (page_offset(page) >= snap_size) {
|
||||
dout("%p page eof %llu\n", page, snap_size);
|
||||
if (page_offset(page) >=
|
||||
(snap_size == -1 ? i_size : snap_size)) {
|
||||
dout("%p page eof %llu\n", page,
|
||||
(snap_size == -1 ? i_size : snap_size));
|
||||
done = 1;
|
||||
unlock_page(page);
|
||||
break;
|
||||
@ -884,7 +889,8 @@ get_more_pages:
|
||||
}
|
||||
|
||||
if (do_sync)
|
||||
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
|
||||
osd_req_op_init(req, 1,
|
||||
CEPH_OSD_OP_STARTSYNC, 0);
|
||||
|
||||
req->r_callback = writepages_finish;
|
||||
req->r_inode = inode;
|
||||
@ -944,10 +950,18 @@ get_more_pages:
|
||||
}
|
||||
|
||||
/* Format the osd request message and submit the write */
|
||||
|
||||
offset = page_offset(pages[0]);
|
||||
len = min(snap_size - offset,
|
||||
(u64)locked_pages << PAGE_CACHE_SHIFT);
|
||||
len = (u64)locked_pages << PAGE_CACHE_SHIFT;
|
||||
if (snap_size == -1) {
|
||||
len = min(len, (u64)i_size_read(inode) - offset);
|
||||
/* writepages_finish() clears writeback pages
|
||||
* according to the data length, so make sure
|
||||
* data length covers all locked pages */
|
||||
len = max(len, 1 +
|
||||
((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT));
|
||||
} else {
|
||||
len = min(len, snap_size - offset);
|
||||
}
|
||||
dout("writepages got %d pages at %llu~%llu\n",
|
||||
locked_pages, offset, len);
|
||||
|
||||
@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file,
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||
loff_t page_off = pos & PAGE_CACHE_MASK;
|
||||
int pos_in_page = pos & ~PAGE_CACHE_MASK;
|
||||
int end_in_page = pos_in_page + len;
|
||||
@ -1044,10 +1057,6 @@ retry_locked:
|
||||
/* writepages currently holds page lock, but if we change that later, */
|
||||
wait_on_page_writeback(page);
|
||||
|
||||
/* check snap context */
|
||||
BUG_ON(!ci->i_snap_realm);
|
||||
down_read(&mdsc->snap_rwsem);
|
||||
BUG_ON(!ci->i_snap_realm->cached_context);
|
||||
snapc = page_snap_context(page);
|
||||
if (snapc && snapc != ci->i_head_snapc) {
|
||||
/*
|
||||
@ -1055,7 +1064,6 @@ retry_locked:
|
||||
* context! is it writeable now?
|
||||
*/
|
||||
oldest = get_oldest_context(inode, NULL);
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
|
||||
if (snapc->seq > oldest->seq) {
|
||||
ceph_put_snap_context(oldest);
|
||||
@ -1112,7 +1120,6 @@ retry_locked:
|
||||
}
|
||||
|
||||
/* we need to read it. */
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
r = readpage_nounlock(file, page);
|
||||
if (r < 0)
|
||||
goto fail_nosnap;
|
||||
@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
|
||||
|
||||
/*
|
||||
* we don't do anything in here that simple_write_end doesn't do
|
||||
* except adjust dirty page accounting and drop read lock on
|
||||
* mdsc->snap_rwsem.
|
||||
* except adjust dirty page accounting
|
||||
*/
|
||||
static int ceph_write_end(struct file *file, struct address_space *mapping,
|
||||
loff_t pos, unsigned len, unsigned copied,
|
||||
struct page *page, void *fsdata)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||
struct ceph_mds_client *mdsc = fsc->mdsc;
|
||||
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
|
||||
int check_cap = 0;
|
||||
|
||||
@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
|
||||
set_page_dirty(page);
|
||||
|
||||
unlock_page(page);
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
page_cache_release(page);
|
||||
|
||||
if (check_cap)
|
||||
@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
struct inode *inode = file_inode(vma->vm_file);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_file_info *fi = vma->vm_file->private_data;
|
||||
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
|
||||
struct ceph_cap_flush *prealloc_cf;
|
||||
struct page *page = vmf->page;
|
||||
loff_t off = page_offset(page);
|
||||
loff_t size = i_size_read(inode);
|
||||
size_t len;
|
||||
int want, got, ret;
|
||||
|
||||
prealloc_cf = ceph_alloc_cap_flush();
|
||||
if (!prealloc_cf)
|
||||
return VM_FAULT_SIGBUS;
|
||||
|
||||
if (ci->i_inline_version != CEPH_INLINE_NONE) {
|
||||
struct page *locked_page = NULL;
|
||||
if (off == 0) {
|
||||
@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
ret = ceph_uninline_data(vma->vm_file, locked_page);
|
||||
if (locked_page)
|
||||
unlock_page(locked_page);
|
||||
if (ret < 0)
|
||||
return VM_FAULT_SIGBUS;
|
||||
if (ret < 0) {
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out_free;
|
||||
}
|
||||
}
|
||||
|
||||
if (off + PAGE_CACHE_SIZE <= size)
|
||||
@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
break;
|
||||
if (ret != -ERESTARTSYS) {
|
||||
WARN_ON(1);
|
||||
return VM_FAULT_SIGBUS;
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto out_free;
|
||||
}
|
||||
}
|
||||
dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
|
||||
@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
if (ret == 0) {
|
||||
/* success. we'll keep the page locked. */
|
||||
set_page_dirty(page);
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
ret = VM_FAULT_LOCKED;
|
||||
} else {
|
||||
if (ret == -ENOMEM)
|
||||
@ -1389,7 +1398,8 @@ out:
|
||||
int dirty;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
ci->i_inline_version = CEPH_INLINE_NONE;
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
|
||||
&prealloc_cf);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
if (dirty)
|
||||
__mark_inode_dirty(inode, dirty);
|
||||
@ -1398,6 +1408,8 @@ out:
|
||||
dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
|
||||
inode, off, len, ceph_cap_string(got), ret);
|
||||
ceph_put_cap_refs(ci, got);
|
||||
out_free:
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
|
||||
ceph_vino(inode), 0, &len, 0, 1,
|
||||
CEPH_OSD_OP_CREATE,
|
||||
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
|
||||
ci->i_snap_realm->cached_context,
|
||||
0, 0, false);
|
||||
ceph_empty_snapc, 0, 0, false);
|
||||
if (IS_ERR(req)) {
|
||||
err = PTR_ERR(req);
|
||||
goto out;
|
||||
@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
|
||||
ceph_vino(inode), 0, &len, 1, 3,
|
||||
CEPH_OSD_OP_WRITE,
|
||||
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
|
||||
ci->i_snap_realm->cached_context,
|
||||
ceph_empty_snapc,
|
||||
ci->i_truncate_seq, ci->i_truncate_size,
|
||||
false);
|
||||
if (IS_ERR(req)) {
|
||||
@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
vma->vm_ops = &ceph_vmops;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Pool access bits.  __ceph_pool_perm_get() ORs these together to build
 * the permission mask it caches and returns.
 */
enum {
	POOL_READ	= 1 << 0,
	POOL_WRITE	= 1 << 1,
};
|
||||
|
||||
/*
 * Look up, or probe and cache, the access this client has to OSD pool @pool.
 *
 * mdsc->pool_perm_tree (guarded by mdsc->pool_perm_rwsem) holds one
 * ceph_pool_perm entry per pool id.  On a cache miss two requests are sent
 * against object "<ino>.00000000" in @pool: a STAT as the read probe and an
 * exclusive CREATE as the write probe.  The outcome is inserted into the
 * tree so later callers hit the cache.
 *
 * Returns a mask of POOL_READ / POOL_WRITE (possibly 0) on success, or a
 * negative errno if a probe failed for a reason other than -EPERM or an
 * allocation failed.
 */
static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
{
	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
	struct rb_node **p, *parent;
	struct ceph_pool_perm *perm;
	struct page **pages;
	int err = 0, err2 = 0, have = 0;
	bool found = false;

	/* fast path: search the cache under the read lock */
	down_read(&mdsc->pool_perm_rwsem);
	p = &mdsc->pool_perm_tree.rb_node;
	while (*p) {
		perm = rb_entry(*p, struct ceph_pool_perm, node);
		if (pool < perm->pool)
			p = &(*p)->rb_left;
		else if (pool > perm->pool)
			p = &(*p)->rb_right;
		else {
			have = perm->perm;
			found = true;
			break;
		}
	}
	up_read(&mdsc->pool_perm_rwsem);
	/*
	 * Decide from what was seen while the lock was held.  Re-reading *p
	 * here would race with a concurrent insert into the same slot and
	 * could report a hit with have == 0.
	 */
	if (found)
		goto out;

	dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);

	/*
	 * Slow path.  The tree may have changed since the read lock was
	 * dropped, so restart from the root: both the insertion slot (p) and
	 * its parent must be recomputed together for rb_link_node().
	 */
	down_write(&mdsc->pool_perm_rwsem);
	p = &mdsc->pool_perm_tree.rb_node;
	parent = NULL;
	while (*p) {
		parent = *p;
		perm = rb_entry(parent, struct ceph_pool_perm, node);
		if (pool < perm->pool)
			p = &(*p)->rb_left;
		else if (pool > perm->pool)
			p = &(*p)->rb_right;
		else {
			have = perm->perm;
			found = true;
			break;
		}
	}
	if (found) {
		/* somebody else cached the result in the meantime */
		up_write(&mdsc->pool_perm_rwsem);
		goto out;
	}

	/* read probe: STAT the first object of this inode in @pool */
	rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
					 ceph_empty_snapc,
					 1, false, GFP_NOFS);
	if (!rd_req) {
		err = -ENOMEM;
		goto out_unlock;
	}

	rd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
	rd_req->r_base_oloc.pool = pool;
	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
		 "%llx.00000000", ci->i_vino.ino);
	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);

	/* write probe: exclusive CREATE of the same object */
	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
					 ceph_empty_snapc,
					 1, false, GFP_NOFS);
	if (!wr_req) {
		err = -ENOMEM;
		goto out_unlock;
	}

	wr_req->r_flags = CEPH_OSD_FLAG_WRITE |
			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
	wr_req->r_base_oloc.pool = pool;
	wr_req->r_base_oid = rd_req->r_base_oid;

	/* one page should be large enough for STAT data */
	pages = ceph_alloc_page_vector(1, GFP_KERNEL);
	if (IS_ERR(pages)) {
		err = PTR_ERR(pages);
		goto out_unlock;
	}

	osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
				     0, false, true);
	ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP,
				&ci->vfs_inode.i_mtime);
	err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);

	ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP,
				&ci->vfs_inode.i_mtime);
	err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);

	/* both probes are started before either one is waited for */
	if (!err)
		err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
	if (!err2)
		err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);

	/* a missing object (-ENOENT) still counts as read access */
	if (err >= 0 || err == -ENOENT)
		have |= POOL_READ;
	else if (err != -EPERM)
		goto out_unlock;

	/* an already existing object (-EEXIST) still counts as write access */
	if (err2 == 0 || err2 == -EEXIST)
		have |= POOL_WRITE;
	else if (err2 != -EPERM) {
		err = err2;
		goto out_unlock;
	}

	perm = kmalloc(sizeof(*perm), GFP_NOFS);
	if (!perm) {
		err = -ENOMEM;
		goto out_unlock;
	}

	perm->pool = pool;
	perm->perm = have;
	rb_link_node(&perm->node, parent, p);
	rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
	err = 0;
out_unlock:
	up_write(&mdsc->pool_perm_rwsem);

	if (rd_req)
		ceph_osdc_put_request(rd_req);
	if (wr_req)
		ceph_osdc_put_request(wr_req);
out:
	/* on success the permission mask is the return value */
	if (!err)
		err = have;
	dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
	return err;
}
|
||||
|
||||
int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
|
||||
{
|
||||
u32 pool;
|
||||
int ret, flags;
|
||||
|
||||
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
|
||||
NOPOOLPERM))
|
||||
return 0;
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
flags = ci->i_ceph_flags;
|
||||
pool = ceph_file_layout_pg_pool(ci->i_layout);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
check:
|
||||
if (flags & CEPH_I_POOL_PERM) {
|
||||
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
|
||||
dout("ceph_pool_perm_check pool %u no read perm\n",
|
||||
pool);
|
||||
return -EPERM;
|
||||
}
|
||||
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
|
||||
dout("ceph_pool_perm_check pool %u no write perm\n",
|
||||
pool);
|
||||
return -EPERM;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = __ceph_pool_perm_get(ci, pool);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
flags = CEPH_I_POOL_PERM;
|
||||
if (ret & POOL_READ)
|
||||
flags |= CEPH_I_POOL_RD;
|
||||
if (ret & POOL_WRITE)
|
||||
flags |= CEPH_I_POOL_WR;
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
|
||||
ci->i_ceph_flags = flags;
|
||||
} else {
|
||||
pool = ceph_file_layout_pg_pool(ci->i_layout);
|
||||
flags = ci->i_ceph_flags;
|
||||
}
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
goto check;
|
||||
}
|
||||
|
||||
void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
|
||||
{
|
||||
struct ceph_pool_perm *perm;
|
||||
struct rb_node *n;
|
||||
|
||||
while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
|
||||
n = rb_first(&mdsc->pool_perm_tree);
|
||||
perm = rb_entry(n, struct ceph_pool_perm, node);
|
||||
rb_erase(n, &mdsc->pool_perm_tree);
|
||||
kfree(perm);
|
||||
}
|
||||
}
|
||||
|
836
fs/ceph/caps.c
836
fs/ceph/caps.c
File diff suppressed because it is too large
Load Diff
389
fs/ceph/dir.c
389
fs/ceph/dir.c
@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
|
||||
if (dentry->d_fsdata)
|
||||
return 0;
|
||||
|
||||
di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
|
||||
di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
|
||||
if (!di)
|
||||
return -ENOMEM; /* oh well */
|
||||
|
||||
@ -106,6 +106,27 @@ static int fpos_cmp(loff_t l, loff_t r)
|
||||
return (int)(fpos_off(l) - fpos_off(r));
|
||||
}
|
||||
|
||||
/*
|
||||
* make note of the last dentry we read, so we can
|
||||
* continue at the same lexicographical point,
|
||||
* regardless of what dir changes take place on the
|
||||
* server.
|
||||
*/
|
||||
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
|
||||
int len, unsigned next_offset)
|
||||
{
|
||||
char *buf = kmalloc(len+1, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
kfree(fi->last_name);
|
||||
fi->last_name = buf;
|
||||
memcpy(fi->last_name, name, len);
|
||||
fi->last_name[len] = 0;
|
||||
fi->next_offset = next_offset;
|
||||
dout("note_last_dentry '%s'\n", fi->last_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* When possible, we try to satisfy a readdir by peeking at the
|
||||
* dcache. We make this work by carefully ordering dentries on
|
||||
@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
|
||||
struct ceph_file_info *fi = file->private_data;
|
||||
struct dentry *parent = file->f_path.dentry;
|
||||
struct inode *dir = d_inode(parent);
|
||||
struct list_head *p;
|
||||
struct dentry *dentry, *last;
|
||||
struct dentry *dentry, *last = NULL;
|
||||
struct ceph_dentry_info *di;
|
||||
unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
|
||||
int err = 0;
|
||||
loff_t ptr_pos = 0;
|
||||
struct ceph_readdir_cache_control cache_ctl = {};
|
||||
|
||||
/* claim ref on last dentry we returned */
|
||||
last = fi->dentry;
|
||||
fi->dentry = NULL;
|
||||
dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
|
||||
|
||||
dout("__dcache_readdir %p v%u at %llu (last %p)\n",
|
||||
dir, shared_gen, ctx->pos, last);
|
||||
|
||||
spin_lock(&parent->d_lock);
|
||||
|
||||
/* start at beginning? */
|
||||
if (ctx->pos == 2 || last == NULL ||
|
||||
fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
|
||||
if (list_empty(&parent->d_subdirs))
|
||||
goto out_unlock;
|
||||
p = parent->d_subdirs.prev;
|
||||
dout(" initial p %p/%p\n", p->prev, p->next);
|
||||
} else {
|
||||
p = last->d_child.prev;
|
||||
/* we can calculate cache index for the first dirfrag */
|
||||
if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
|
||||
cache_ctl.index = fpos_off(ctx->pos) - 2;
|
||||
BUG_ON(cache_ctl.index < 0);
|
||||
ptr_pos = cache_ctl.index * sizeof(struct dentry *);
|
||||
}
|
||||
|
||||
more:
|
||||
dentry = list_entry(p, struct dentry, d_child);
|
||||
di = ceph_dentry(dentry);
|
||||
while (1) {
|
||||
dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
|
||||
d_unhashed(dentry) ? "!hashed" : "hashed",
|
||||
parent->d_subdirs.prev, parent->d_subdirs.next);
|
||||
if (p == &parent->d_subdirs) {
|
||||
while (true) {
|
||||
pgoff_t pgoff;
|
||||
bool emit_dentry;
|
||||
|
||||
if (ptr_pos >= i_size_read(dir)) {
|
||||
fi->flags |= CEPH_F_ATEND;
|
||||
goto out_unlock;
|
||||
err = 0;
|
||||
break;
|
||||
}
|
||||
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
|
||||
|
||||
err = -EAGAIN;
|
||||
pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
|
||||
if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
|
||||
ceph_readdir_cache_release(&cache_ctl);
|
||||
cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
|
||||
if (!cache_ctl.page) {
|
||||
dout(" page %lu not found\n", pgoff);
|
||||
break;
|
||||
}
|
||||
/* reading/filling the cache are serialized by
|
||||
* i_mutex, no need to use page lock */
|
||||
unlock_page(cache_ctl.page);
|
||||
cache_ctl.dentries = kmap(cache_ctl.page);
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
spin_lock(&parent->d_lock);
|
||||
/* check i_size again here, because empty directory can be
|
||||
* marked as complete while not holding the i_mutex. */
|
||||
if (ceph_dir_is_complete_ordered(dir) &&
|
||||
ptr_pos < i_size_read(dir))
|
||||
dentry = cache_ctl.dentries[cache_ctl.index % nsize];
|
||||
else
|
||||
dentry = NULL;
|
||||
spin_unlock(&parent->d_lock);
|
||||
if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
|
||||
dentry = NULL;
|
||||
rcu_read_unlock();
|
||||
if (!dentry)
|
||||
break;
|
||||
|
||||
emit_dentry = false;
|
||||
di = ceph_dentry(dentry);
|
||||
spin_lock(&dentry->d_lock);
|
||||
if (di->lease_shared_gen == shared_gen &&
|
||||
!d_unhashed(dentry) && d_really_is_positive(dentry) &&
|
||||
d_really_is_positive(dentry) &&
|
||||
ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
|
||||
ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
|
||||
fpos_cmp(ctx->pos, di->offset) <= 0)
|
||||
break;
|
||||
dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
|
||||
dentry, di->offset,
|
||||
ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
|
||||
!d_inode(dentry) ? " null" : "");
|
||||
spin_unlock(&dentry->d_lock);
|
||||
p = p->prev;
|
||||
dentry = list_entry(p, struct dentry, d_child);
|
||||
di = ceph_dentry(dentry);
|
||||
}
|
||||
|
||||
dget_dlock(dentry);
|
||||
spin_unlock(&dentry->d_lock);
|
||||
spin_unlock(&parent->d_lock);
|
||||
|
||||
/* make sure a dentry wasn't dropped while we didn't have parent lock */
|
||||
if (!ceph_dir_is_complete_ordered(dir)) {
|
||||
dout(" lost dir complete on %p; falling back to mds\n", dir);
|
||||
dput(dentry);
|
||||
err = -EAGAIN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
|
||||
dentry, dentry, d_inode(dentry));
|
||||
if (!dir_emit(ctx, dentry->d_name.name,
|
||||
dentry->d_name.len,
|
||||
ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino),
|
||||
d_inode(dentry)->i_mode >> 12)) {
|
||||
if (last) {
|
||||
/* remember our position */
|
||||
fi->dentry = last;
|
||||
fi->next_offset = fpos_off(di->offset);
|
||||
fpos_cmp(ctx->pos, di->offset) <= 0) {
|
||||
emit_dentry = true;
|
||||
}
|
||||
dput(dentry);
|
||||
return 0;
|
||||
spin_unlock(&dentry->d_lock);
|
||||
|
||||
if (emit_dentry) {
|
||||
dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
|
||||
dentry, dentry, d_inode(dentry));
|
||||
ctx->pos = di->offset;
|
||||
if (!dir_emit(ctx, dentry->d_name.name,
|
||||
dentry->d_name.len,
|
||||
ceph_translate_ino(dentry->d_sb,
|
||||
d_inode(dentry)->i_ino),
|
||||
d_inode(dentry)->i_mode >> 12)) {
|
||||
dput(dentry);
|
||||
err = 0;
|
||||
break;
|
||||
}
|
||||
ctx->pos++;
|
||||
|
||||
if (last)
|
||||
dput(last);
|
||||
last = dentry;
|
||||
} else {
|
||||
dput(dentry);
|
||||
}
|
||||
|
||||
cache_ctl.index++;
|
||||
ptr_pos += sizeof(struct dentry *);
|
||||
}
|
||||
|
||||
ctx->pos = di->offset + 1;
|
||||
|
||||
if (last)
|
||||
dput(last);
|
||||
last = dentry;
|
||||
|
||||
spin_lock(&parent->d_lock);
|
||||
p = p->prev; /* advance to next dentry */
|
||||
goto more;
|
||||
|
||||
out_unlock:
|
||||
spin_unlock(&parent->d_lock);
|
||||
out:
|
||||
if (last)
|
||||
ceph_readdir_cache_release(&cache_ctl);
|
||||
if (last) {
|
||||
int ret;
|
||||
di = ceph_dentry(last);
|
||||
ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
|
||||
fpos_off(di->offset) + 1);
|
||||
if (ret < 0)
|
||||
err = ret;
|
||||
dput(last);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* make note of the last dentry we read, so we can
|
||||
* continue at the same lexicographical point,
|
||||
* regardless of what dir changes take place on the
|
||||
* server.
|
||||
*/
|
||||
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
|
||||
int len)
|
||||
{
|
||||
kfree(fi->last_name);
|
||||
fi->last_name = kmalloc(len+1, GFP_NOFS);
|
||||
if (!fi->last_name)
|
||||
return -ENOMEM;
|
||||
memcpy(fi->last_name, name, len);
|
||||
fi->last_name[len] = 0;
|
||||
dout("note_last_dentry '%s'\n", fi->last_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ceph_readdir(struct file *file, struct dir_context *ctx)
|
||||
{
|
||||
struct ceph_file_info *fi = file->private_data;
|
||||
@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
|
||||
|
||||
/* can we use the dcache? */
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if ((ctx->pos == 2 || fi->dentry) &&
|
||||
ceph_test_mount_opt(fsc, DCACHE) &&
|
||||
if (ceph_test_mount_opt(fsc, DCACHE) &&
|
||||
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
|
||||
ceph_snap(inode) != CEPH_SNAPDIR &&
|
||||
__ceph_dir_is_complete_ordered(ci) &&
|
||||
@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
|
||||
} else {
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
}
|
||||
if (fi->dentry) {
|
||||
err = note_last_dentry(fi, fi->dentry->d_name.name,
|
||||
fi->dentry->d_name.len);
|
||||
if (err)
|
||||
return err;
|
||||
dput(fi->dentry);
|
||||
fi->dentry = NULL;
|
||||
}
|
||||
|
||||
/* proceed with a normal readdir */
|
||||
|
||||
if (ctx->pos == 2) {
|
||||
/* note dir version at start of readdir so we can tell
|
||||
* if any dentries get dropped */
|
||||
fi->dir_release_count = atomic_read(&ci->i_release_count);
|
||||
fi->dir_ordered_count = ci->i_ordered_count;
|
||||
}
|
||||
|
||||
more:
|
||||
/* do we have the correct frag content buffered? */
|
||||
if (fi->frag != frag || fi->last_readdir == NULL) {
|
||||
@ -342,12 +336,15 @@ more:
|
||||
req->r_direct_hash = ceph_frag_value(frag);
|
||||
req->r_direct_is_hash = true;
|
||||
if (fi->last_name) {
|
||||
req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
|
||||
req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
|
||||
if (!req->r_path2) {
|
||||
ceph_mdsc_put_request(req);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
req->r_dir_release_cnt = fi->dir_release_count;
|
||||
req->r_dir_ordered_cnt = fi->dir_ordered_count;
|
||||
req->r_readdir_cache_idx = fi->readdir_cache_idx;
|
||||
req->r_readdir_offset = fi->next_offset;
|
||||
req->r_args.readdir.frag = cpu_to_le32(frag);
|
||||
|
||||
@ -364,26 +361,38 @@ more:
|
||||
(int)req->r_reply_info.dir_end,
|
||||
(int)req->r_reply_info.dir_complete);
|
||||
|
||||
if (!req->r_did_prepopulate) {
|
||||
dout("readdir !did_prepopulate");
|
||||
/* preclude from marking dir complete */
|
||||
fi->dir_release_count--;
|
||||
}
|
||||
|
||||
/* note next offset and last dentry name */
|
||||
rinfo = &req->r_reply_info;
|
||||
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
|
||||
frag = le32_to_cpu(rinfo->dir_dir->frag);
|
||||
if (ceph_frag_is_leftmost(frag))
|
||||
fi->next_offset = 2;
|
||||
else
|
||||
fi->next_offset = 0;
|
||||
off = fi->next_offset;
|
||||
off = req->r_readdir_offset;
|
||||
fi->next_offset = off;
|
||||
}
|
||||
|
||||
fi->frag = frag;
|
||||
fi->offset = fi->next_offset;
|
||||
fi->last_readdir = req;
|
||||
|
||||
if (req->r_did_prepopulate) {
|
||||
fi->readdir_cache_idx = req->r_readdir_cache_idx;
|
||||
if (fi->readdir_cache_idx < 0) {
|
||||
/* preclude from marking dir ordered */
|
||||
fi->dir_ordered_count = 0;
|
||||
} else if (ceph_frag_is_leftmost(frag) && off == 2) {
|
||||
/* note dir version at start of readdir so
|
||||
* we can tell if any dentries get dropped */
|
||||
fi->dir_release_count = req->r_dir_release_cnt;
|
||||
fi->dir_ordered_count = req->r_dir_ordered_cnt;
|
||||
}
|
||||
} else {
|
||||
dout("readdir !did_prepopulate");
|
||||
/* disable readdir cache */
|
||||
fi->readdir_cache_idx = -1;
|
||||
/* preclude from marking dir complete */
|
||||
fi->dir_release_count = 0;
|
||||
}
|
||||
|
||||
if (req->r_reply_info.dir_end) {
|
||||
kfree(fi->last_name);
|
||||
fi->last_name = NULL;
|
||||
@ -394,10 +403,10 @@ more:
|
||||
} else {
|
||||
err = note_last_dentry(fi,
|
||||
rinfo->dir_dname[rinfo->dir_nr-1],
|
||||
rinfo->dir_dname_len[rinfo->dir_nr-1]);
|
||||
rinfo->dir_dname_len[rinfo->dir_nr-1],
|
||||
fi->next_offset + rinfo->dir_nr);
|
||||
if (err)
|
||||
return err;
|
||||
fi->next_offset += rinfo->dir_nr;
|
||||
}
|
||||
}
|
||||
|
||||
@ -453,16 +462,22 @@ more:
|
||||
* were released during the whole readdir, and we should have
|
||||
* the complete dir contents in our cache.
|
||||
*/
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
|
||||
if (ci->i_ordered_count == fi->dir_ordered_count)
|
||||
if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
|
||||
dout(" marking %p complete and ordered\n", inode);
|
||||
else
|
||||
/* use i_size to track number of entries in
|
||||
* readdir cache */
|
||||
BUG_ON(fi->readdir_cache_idx < 0);
|
||||
i_size_write(inode, fi->readdir_cache_idx *
|
||||
sizeof(struct dentry*));
|
||||
} else {
|
||||
dout(" marking %p complete\n", inode);
|
||||
}
|
||||
__ceph_dir_set_complete(ci, fi->dir_release_count,
|
||||
fi->dir_ordered_count);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
}
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
dout("readdir %p file %p done.\n", inode, file);
|
||||
return 0;
|
||||
@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
|
||||
}
|
||||
kfree(fi->last_name);
|
||||
fi->last_name = NULL;
|
||||
fi->dir_release_count = 0;
|
||||
fi->readdir_cache_idx = -1;
|
||||
if (ceph_frag_is_leftmost(frag))
|
||||
fi->next_offset = 2; /* compensate for . and .. */
|
||||
else
|
||||
fi->next_offset = 0;
|
||||
if (fi->dentry) {
|
||||
dput(fi->dentry);
|
||||
fi->dentry = NULL;
|
||||
}
|
||||
fi->flags &= ~CEPH_F_ATEND;
|
||||
}
|
||||
|
||||
@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
|
||||
mutex_lock(&inode->i_mutex);
|
||||
retval = -EINVAL;
|
||||
switch (whence) {
|
||||
case SEEK_END:
|
||||
offset += inode->i_size + 2; /* FIXME */
|
||||
break;
|
||||
case SEEK_CUR:
|
||||
offset += file->f_pos;
|
||||
case SEEK_SET:
|
||||
break;
|
||||
case SEEK_END:
|
||||
retval = -EOPNOTSUPP;
|
||||
default:
|
||||
goto out;
|
||||
}
|
||||
@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
|
||||
}
|
||||
retval = offset;
|
||||
|
||||
/*
|
||||
* discard buffered readdir content on seekdir(0), or
|
||||
* seek to new frag, or seek prior to current chunk.
|
||||
*/
|
||||
if (offset == 0 ||
|
||||
fpos_frag(offset) != fi->frag ||
|
||||
fpos_off(offset) < fi->offset) {
|
||||
/* discard buffered readdir content on seekdir(0), or
|
||||
* seek to new frag, or seek prior to current chunk */
|
||||
dout("dir_llseek dropping %p content\n", file);
|
||||
reset_readdir(fi, fpos_frag(offset));
|
||||
} else if (fpos_cmp(offset, old_offset) > 0) {
|
||||
/* reset dir_release_count if we did a forward seek */
|
||||
fi->dir_release_count = 0;
|
||||
fi->readdir_cache_idx = -1;
|
||||
}
|
||||
|
||||
/* bump dir_release_count if we did a forward seek */
|
||||
if (fpos_cmp(offset, old_offset) > 0)
|
||||
fi->dir_release_count--;
|
||||
}
|
||||
out:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
|
||||
err = PTR_ERR(req);
|
||||
goto out;
|
||||
}
|
||||
req->r_path2 = kstrdup(dest, GFP_NOFS);
|
||||
req->r_path2 = kstrdup(dest, GFP_KERNEL);
|
||||
if (!req->r_path2) {
|
||||
err = -ENOMEM;
|
||||
ceph_mdsc_put_request(req);
|
||||
@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
|
||||
* to do it here.
|
||||
*/
|
||||
|
||||
/* d_move screws up sibling dentries' offsets */
|
||||
ceph_dir_clear_complete(old_dir);
|
||||
ceph_dir_clear_complete(new_dir);
|
||||
|
||||
d_move(old_dentry, new_dentry);
|
||||
|
||||
/* ensure target dentry is invalidated, despite
|
||||
rehashing bug in vfs_rename_dir */
|
||||
ceph_invalidate_dentry_lease(new_dentry);
|
||||
|
||||
/* d_move screws up sibling dentries' offsets */
|
||||
ceph_dir_clear_complete(old_dir);
|
||||
ceph_dir_clear_complete(new_dir);
|
||||
|
||||
}
|
||||
ceph_mdsc_put_request(req);
|
||||
return err;
|
||||
@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
|
||||
return -EISDIR;
|
||||
|
||||
if (!cf->dir_info) {
|
||||
cf->dir_info = kmalloc(bufsize, GFP_NOFS);
|
||||
cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
|
||||
if (!cf->dir_info)
|
||||
return -ENOMEM;
|
||||
cf->dir_info_len =
|
||||
@ -1223,66 +1232,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
|
||||
return size - left;
|
||||
}
|
||||
|
||||
/*
|
||||
* an fsync() on a dir will wait for any uncommitted directory
|
||||
* operations to commit.
|
||||
*/
|
||||
static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
|
||||
int datasync)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct list_head *head = &ci->i_unsafe_dirops;
|
||||
struct ceph_mds_request *req;
|
||||
u64 last_tid;
|
||||
int ret = 0;
|
||||
|
||||
dout("dir_fsync %p\n", inode);
|
||||
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
|
||||
if (ret)
|
||||
return ret;
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
spin_lock(&ci->i_unsafe_lock);
|
||||
if (list_empty(head))
|
||||
goto out;
|
||||
|
||||
req = list_entry(head->prev,
|
||||
struct ceph_mds_request, r_unsafe_dir_item);
|
||||
last_tid = req->r_tid;
|
||||
|
||||
do {
|
||||
ceph_mdsc_get_request(req);
|
||||
spin_unlock(&ci->i_unsafe_lock);
|
||||
|
||||
dout("dir_fsync %p wait on tid %llu (until %llu)\n",
|
||||
inode, req->r_tid, last_tid);
|
||||
if (req->r_timeout) {
|
||||
unsigned long time_left = wait_for_completion_timeout(
|
||||
&req->r_safe_completion,
|
||||
req->r_timeout);
|
||||
if (time_left > 0)
|
||||
ret = 0;
|
||||
else
|
||||
ret = -EIO; /* timed out */
|
||||
} else {
|
||||
wait_for_completion(&req->r_safe_completion);
|
||||
}
|
||||
ceph_mdsc_put_request(req);
|
||||
|
||||
spin_lock(&ci->i_unsafe_lock);
|
||||
if (ret || list_empty(head))
|
||||
break;
|
||||
req = list_entry(head->next,
|
||||
struct ceph_mds_request, r_unsafe_dir_item);
|
||||
} while (req->r_tid < last_tid);
|
||||
out:
|
||||
spin_unlock(&ci->i_unsafe_lock);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* We maintain a private dentry LRU.
|
||||
*
|
||||
@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = {
|
||||
.open = ceph_open,
|
||||
.release = ceph_release,
|
||||
.unlocked_ioctl = ceph_ioctl,
|
||||
.fsync = ceph_dir_fsync,
|
||||
.fsync = ceph_fsync,
|
||||
};
|
||||
|
||||
const struct file_operations ceph_snapdir_fops = {
|
||||
|
@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
|
||||
case S_IFDIR:
|
||||
dout("init_file %p %p 0%o (regular)\n", inode, file,
|
||||
inode->i_mode);
|
||||
cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
|
||||
cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO);
|
||||
if (cf == NULL) {
|
||||
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
|
||||
return -ENOMEM;
|
||||
}
|
||||
cf->fmode = fmode;
|
||||
cf->next_offset = 2;
|
||||
cf->readdir_cache_idx = -1;
|
||||
file->private_data = cf;
|
||||
BUG_ON(inode->i_fop->release != ceph_release);
|
||||
break;
|
||||
@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
|
||||
ceph_mdsc_put_request(cf->last_readdir);
|
||||
kfree(cf->last_name);
|
||||
kfree(cf->dir_info);
|
||||
dput(cf->dentry);
|
||||
kmem_cache_free(ceph_file_cachep, cf);
|
||||
|
||||
/* wake up anyone waiting for caps on this inode */
|
||||
@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
|
||||
}
|
||||
} else {
|
||||
num_pages = calc_pages_for(off, len);
|
||||
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
|
||||
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
|
||||
if (IS_ERR(pages))
|
||||
return PTR_ERR(pages);
|
||||
ret = striped_read(inode, off, len, pages,
|
||||
@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
|
||||
* objects, rollback on failure, etc.)
|
||||
*/
|
||||
static ssize_t
|
||||
ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
|
||||
ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
|
||||
struct ceph_snap_context *snapc)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||
struct ceph_snap_context *snapc;
|
||||
struct ceph_vino vino;
|
||||
struct ceph_osd_request *req;
|
||||
struct page **pages;
|
||||
@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
|
||||
size_t start;
|
||||
ssize_t n;
|
||||
|
||||
snapc = ci->i_snap_realm->cached_context;
|
||||
vino = ceph_vino(inode);
|
||||
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
|
||||
vino, pos, &len, 0,
|
||||
@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
|
||||
break;
|
||||
}
|
||||
|
||||
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
|
||||
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
|
||||
|
||||
n = iov_iter_get_pages_alloc(from, &pages, len, &start);
|
||||
if (unlikely(n < 0)) {
|
||||
@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
|
||||
* objects, rollback on failure, etc.)
|
||||
*/
|
||||
static ssize_t
|
||||
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
|
||||
ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
|
||||
struct ceph_snap_context *snapc)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
|
||||
struct ceph_snap_context *snapc;
|
||||
struct ceph_vino vino;
|
||||
struct ceph_osd_request *req;
|
||||
struct page **pages;
|
||||
@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
|
||||
size_t left;
|
||||
int n;
|
||||
|
||||
snapc = ci->i_snap_realm->cached_context;
|
||||
vino = ceph_vino(inode);
|
||||
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
|
||||
vino, pos, &len, 0, 1,
|
||||
@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
|
||||
*/
|
||||
num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
|
||||
|
||||
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
|
||||
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
|
||||
if (IS_ERR(pages)) {
|
||||
ret = PTR_ERR(pages);
|
||||
goto out;
|
||||
@ -860,7 +858,7 @@ again:
|
||||
struct page *page = NULL;
|
||||
loff_t i_size;
|
||||
if (retry_op == READ_INLINE) {
|
||||
page = __page_cache_alloc(GFP_NOFS);
|
||||
page = __page_cache_alloc(GFP_KERNEL);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
}
|
||||
@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_osd_client *osdc =
|
||||
&ceph_sb_to_client(inode->i_sb)->client->osdc;
|
||||
struct ceph_cap_flush *prealloc_cf;
|
||||
ssize_t count, written = 0;
|
||||
int err, want, got;
|
||||
loff_t pos;
|
||||
@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
if (ceph_snap(inode) != CEPH_NOSNAP)
|
||||
return -EROFS;
|
||||
|
||||
prealloc_cf = ceph_alloc_cap_flush();
|
||||
if (!prealloc_cf)
|
||||
return -ENOMEM;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
/* We can write back this queue in page reclaim */
|
||||
@ -996,14 +999,30 @@ retry_snap:
|
||||
|
||||
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
|
||||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
|
||||
struct ceph_snap_context *snapc;
|
||||
struct iov_iter data;
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (__ceph_have_pending_cap_snap(ci)) {
|
||||
struct ceph_cap_snap *capsnap =
|
||||
list_last_entry(&ci->i_cap_snaps,
|
||||
struct ceph_cap_snap,
|
||||
ci_item);
|
||||
snapc = ceph_get_snap_context(capsnap->context);
|
||||
} else {
|
||||
BUG_ON(!ci->i_head_snapc);
|
||||
snapc = ceph_get_snap_context(ci->i_head_snapc);
|
||||
}
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
/* we might need to revert back to that point */
|
||||
data = *from;
|
||||
if (iocb->ki_flags & IOCB_DIRECT)
|
||||
written = ceph_sync_direct_write(iocb, &data, pos);
|
||||
written = ceph_sync_direct_write(iocb, &data, pos,
|
||||
snapc);
|
||||
else
|
||||
written = ceph_sync_write(iocb, &data, pos);
|
||||
written = ceph_sync_write(iocb, &data, pos, snapc);
|
||||
if (written == -EOLDSNAPC) {
|
||||
dout("aio_write %p %llx.%llx %llu~%u"
|
||||
"got EOLDSNAPC, retrying\n",
|
||||
@ -1014,6 +1033,7 @@ retry_snap:
|
||||
}
|
||||
if (written > 0)
|
||||
iov_iter_advance(from, written);
|
||||
ceph_put_snap_context(snapc);
|
||||
} else {
|
||||
loff_t old_size = inode->i_size;
|
||||
/*
|
||||
@ -1035,7 +1055,8 @@ retry_snap:
|
||||
int dirty;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
ci->i_inline_version = CEPH_INLINE_NONE;
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
|
||||
&prealloc_cf);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
if (dirty)
|
||||
__mark_inode_dirty(inode, dirty);
|
||||
@ -1059,6 +1080,7 @@ retry_snap:
|
||||
out:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
out_unlocked:
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
current->backing_dev_info = NULL;
|
||||
return written ? written : err;
|
||||
}
|
||||
@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_osd_client *osdc =
|
||||
&ceph_inode_to_client(inode)->client->osdc;
|
||||
struct ceph_cap_flush *prealloc_cf;
|
||||
int want, got = 0;
|
||||
int dirty;
|
||||
int ret = 0;
|
||||
@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
|
||||
if (!S_ISREG(inode->i_mode))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
prealloc_cf = ceph_alloc_cap_flush();
|
||||
if (!prealloc_cf)
|
||||
return -ENOMEM;
|
||||
|
||||
mutex_lock(&inode->i_mutex);
|
||||
|
||||
if (ceph_snap(inode) != CEPH_NOSNAP) {
|
||||
@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
|
||||
if (!ret) {
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
ci->i_inline_version = CEPH_INLINE_NONE;
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
|
||||
&prealloc_cf);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
if (dirty)
|
||||
__mark_inode_dirty(inode, dirty);
|
||||
@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
|
||||
ceph_put_cap_refs(ci, got);
|
||||
unlock:
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
155
fs/ceph/inode.c
155
fs/ceph/inode.c
@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
|
||||
ci->i_inline_version = 0;
|
||||
ci->i_time_warp_seq = 0;
|
||||
ci->i_ceph_flags = 0;
|
||||
ci->i_ordered_count = 0;
|
||||
atomic_set(&ci->i_release_count, 1);
|
||||
atomic_set(&ci->i_complete_count, 0);
|
||||
atomic64_set(&ci->i_ordered_count, 1);
|
||||
atomic64_set(&ci->i_release_count, 1);
|
||||
atomic64_set(&ci->i_complete_seq[0], 0);
|
||||
atomic64_set(&ci->i_complete_seq[1], 0);
|
||||
ci->i_symlink = NULL;
|
||||
|
||||
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
|
||||
@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
|
||||
ci->i_flushing_caps = 0;
|
||||
INIT_LIST_HEAD(&ci->i_dirty_item);
|
||||
INIT_LIST_HEAD(&ci->i_flushing_item);
|
||||
ci->i_cap_flush_seq = 0;
|
||||
ci->i_cap_flush_last_tid = 0;
|
||||
memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
|
||||
ci->i_prealloc_cap_flush = NULL;
|
||||
ci->i_cap_flush_tree = RB_ROOT;
|
||||
init_waitqueue_head(&ci->i_cap_wq);
|
||||
ci->i_hold_caps_min = 0;
|
||||
ci->i_hold_caps_max = 0;
|
||||
@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
|
||||
|
||||
if (new_version ||
|
||||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
|
||||
if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
|
||||
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
|
||||
ci->i_layout = info->layout;
|
||||
|
||||
queue_trunc = ceph_fill_file_size(inode, issued,
|
||||
le32_to_cpu(info->truncate_seq),
|
||||
le64_to_cpu(info->truncate_size),
|
||||
@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
|
||||
(issued & CEPH_CAP_FILE_EXCL) == 0 &&
|
||||
!__ceph_dir_is_complete(ci)) {
|
||||
dout(" marking %p complete (empty)\n", inode);
|
||||
i_size_write(inode, 0);
|
||||
__ceph_dir_set_complete(ci,
|
||||
atomic_read(&ci->i_release_count),
|
||||
ci->i_ordered_count);
|
||||
atomic64_read(&ci->i_release_count),
|
||||
atomic64_read(&ci->i_ordered_count));
|
||||
}
|
||||
|
||||
wake = true;
|
||||
@ -1212,6 +1216,10 @@ retry_lookup:
|
||||
dout("fill_trace doing d_move %p -> %p\n",
|
||||
req->r_old_dentry, dn);
|
||||
|
||||
/* d_move screws up sibling dentries' offsets */
|
||||
ceph_dir_clear_ordered(dir);
|
||||
ceph_dir_clear_ordered(olddir);
|
||||
|
||||
d_move(req->r_old_dentry, dn);
|
||||
dout(" src %p '%pd' dst %p '%pd'\n",
|
||||
req->r_old_dentry,
|
||||
@ -1222,10 +1230,6 @@ retry_lookup:
|
||||
rehashing bug in vfs_rename_dir */
|
||||
ceph_invalidate_dentry_lease(dn);
|
||||
|
||||
/* d_move screws up sibling dentries' offsets */
|
||||
ceph_dir_clear_ordered(dir);
|
||||
ceph_dir_clear_ordered(olddir);
|
||||
|
||||
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
|
||||
ceph_dentry(req->r_old_dentry)->offset);
|
||||
|
||||
@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
|
||||
return err;
|
||||
}
|
||||
|
||||
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
|
||||
{
|
||||
if (ctl->page) {
|
||||
kunmap(ctl->page);
|
||||
page_cache_release(ctl->page);
|
||||
ctl->page = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
|
||||
struct ceph_readdir_cache_control *ctl,
|
||||
struct ceph_mds_request *req)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(dir);
|
||||
unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
|
||||
unsigned idx = ctl->index % nsize;
|
||||
pgoff_t pgoff = ctl->index / nsize;
|
||||
|
||||
if (!ctl->page || pgoff != page_index(ctl->page)) {
|
||||
ceph_readdir_cache_release(ctl);
|
||||
ctl->page = grab_cache_page(&dir->i_data, pgoff);
|
||||
if (!ctl->page) {
|
||||
ctl->index = -1;
|
||||
return -ENOMEM;
|
||||
}
|
||||
/* reading/filling the cache are serialized by
|
||||
* i_mutex, no need to use page lock */
|
||||
unlock_page(ctl->page);
|
||||
ctl->dentries = kmap(ctl->page);
|
||||
}
|
||||
|
||||
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
|
||||
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
|
||||
dout("readdir cache dn %p idx %d\n", dn, ctl->index);
|
||||
ctl->dentries[idx] = dn;
|
||||
ctl->index++;
|
||||
} else {
|
||||
dout("disable readdir cache\n");
|
||||
ctl->index = -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ceph_readdir_prepopulate(struct ceph_mds_request *req,
|
||||
struct ceph_mds_session *session)
|
||||
{
|
||||
@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
|
||||
struct inode *snapdir = NULL;
|
||||
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
|
||||
struct ceph_dentry_info *di;
|
||||
u64 r_readdir_offset = req->r_readdir_offset;
|
||||
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
|
||||
struct ceph_readdir_cache_control cache_ctl = {};
|
||||
|
||||
if (req->r_aborted)
|
||||
return readdir_prepopulate_inodes_only(req, session);
|
||||
|
||||
if (rinfo->dir_dir &&
|
||||
le32_to_cpu(rinfo->dir_dir->frag) != frag) {
|
||||
@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
|
||||
frag, le32_to_cpu(rinfo->dir_dir->frag));
|
||||
frag = le32_to_cpu(rinfo->dir_dir->frag);
|
||||
if (ceph_frag_is_leftmost(frag))
|
||||
r_readdir_offset = 2;
|
||||
req->r_readdir_offset = 2;
|
||||
else
|
||||
r_readdir_offset = 0;
|
||||
req->r_readdir_offset = 0;
|
||||
}
|
||||
|
||||
if (req->r_aborted)
|
||||
return readdir_prepopulate_inodes_only(req, session);
|
||||
|
||||
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
|
||||
snapdir = ceph_get_snapdir(d_inode(parent));
|
||||
parent = d_find_alias(snapdir);
|
||||
@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
|
||||
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
|
||||
}
|
||||
|
||||
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
|
||||
/* note dir version at start of readdir so we can tell
|
||||
* if any dentries get dropped */
|
||||
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
|
||||
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
|
||||
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
|
||||
req->r_readdir_cache_idx = 0;
|
||||
}
|
||||
|
||||
cache_ctl.index = req->r_readdir_cache_idx;
|
||||
|
||||
/* FIXME: release caps/leases if error occurs */
|
||||
for (i = 0; i < rinfo->dir_nr; i++) {
|
||||
struct ceph_vino vino;
|
||||
@ -1413,13 +1471,6 @@ retry_lookup:
|
||||
d_delete(dn);
|
||||
dput(dn);
|
||||
goto retry_lookup;
|
||||
} else {
|
||||
/* reorder parent's d_subdirs */
|
||||
spin_lock(&parent->d_lock);
|
||||
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
|
||||
list_move(&dn->d_child, &parent->d_subdirs);
|
||||
spin_unlock(&dn->d_lock);
|
||||
spin_unlock(&parent->d_lock);
|
||||
}
|
||||
|
||||
/* inode */
|
||||
@ -1436,13 +1487,15 @@ retry_lookup:
|
||||
}
|
||||
}
|
||||
|
||||
if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
|
||||
req->r_request_started, -1,
|
||||
&req->r_caps_reservation) < 0) {
|
||||
ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
|
||||
req->r_request_started, -1,
|
||||
&req->r_caps_reservation);
|
||||
if (ret < 0) {
|
||||
pr_err("fill_inode badness on %p\n", in);
|
||||
if (d_really_is_negative(dn))
|
||||
iput(in);
|
||||
d_drop(dn);
|
||||
err = ret;
|
||||
goto next_item;
|
||||
}
|
||||
|
||||
@ -1458,19 +1511,28 @@ retry_lookup:
|
||||
}
|
||||
|
||||
di = dn->d_fsdata;
|
||||
di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
|
||||
di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
|
||||
|
||||
update_dentry_lease(dn, rinfo->dir_dlease[i],
|
||||
req->r_session,
|
||||
req->r_request_started);
|
||||
|
||||
if (err == 0 && cache_ctl.index >= 0) {
|
||||
ret = fill_readdir_cache(d_inode(parent), dn,
|
||||
&cache_ctl, req);
|
||||
if (ret < 0)
|
||||
err = ret;
|
||||
}
|
||||
next_item:
|
||||
if (dn)
|
||||
dput(dn);
|
||||
}
|
||||
if (err == 0)
|
||||
req->r_did_prepopulate = true;
|
||||
|
||||
out:
|
||||
if (err == 0) {
|
||||
req->r_did_prepopulate = true;
|
||||
req->r_readdir_cache_idx = cache_ctl.index;
|
||||
}
|
||||
ceph_readdir_cache_release(&cache_ctl);
|
||||
if (snapdir) {
|
||||
iput(snapdir);
|
||||
dput(parent);
|
||||
@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
const unsigned int ia_valid = attr->ia_valid;
|
||||
struct ceph_mds_request *req;
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
|
||||
struct ceph_cap_flush *prealloc_cf;
|
||||
int issued;
|
||||
int release = 0, dirtied = 0;
|
||||
int mask = 0;
|
||||
int err = 0;
|
||||
int inode_dirty_flags = 0;
|
||||
bool lock_snap_rwsem = false;
|
||||
|
||||
if (ceph_snap(inode) != CEPH_NOSNAP)
|
||||
return -EROFS;
|
||||
@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
if (err != 0)
|
||||
return err;
|
||||
|
||||
prealloc_cf = ceph_alloc_cap_flush();
|
||||
if (!prealloc_cf)
|
||||
return -ENOMEM;
|
||||
|
||||
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
|
||||
USE_AUTH_MDS);
|
||||
if (IS_ERR(req))
|
||||
if (IS_ERR(req)) {
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
return PTR_ERR(req);
|
||||
}
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
issued = __ceph_caps_issued(ci, NULL);
|
||||
|
||||
if (!ci->i_head_snapc &&
|
||||
(issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
|
||||
lock_snap_rwsem = true;
|
||||
if (!down_read_trylock(&mdsc->snap_rwsem)) {
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
down_read(&mdsc->snap_rwsem);
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
issued = __ceph_caps_issued(ci, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
|
||||
|
||||
if (ia_valid & ATTR_UID) {
|
||||
@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
dout("setattr %p ATTR_FILE ... hrm!\n", inode);
|
||||
|
||||
if (dirtied) {
|
||||
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
|
||||
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
|
||||
&prealloc_cf);
|
||||
inode->i_ctime = CURRENT_TIME;
|
||||
}
|
||||
|
||||
release &= issued;
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
if (lock_snap_rwsem)
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
|
||||
if (inode_dirty_flags)
|
||||
__mark_inode_dirty(inode, inode_dirty_flags);
|
||||
@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
ceph_mdsc_put_request(req);
|
||||
if (mask & CEPH_SETATTR_SIZE)
|
||||
__ceph_do_pending_vmtruncate(inode);
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
return err;
|
||||
out_put:
|
||||
ceph_mdsc_put_request(req);
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/ratelimit.h>
|
||||
|
||||
#include "super.h"
|
||||
#include "mds_client.h"
|
||||
@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
|
||||
s->s_cap_reconnect = 0;
|
||||
s->s_cap_iterator = NULL;
|
||||
INIT_LIST_HEAD(&s->s_cap_releases);
|
||||
INIT_LIST_HEAD(&s->s_cap_releases_done);
|
||||
INIT_LIST_HEAD(&s->s_cap_flushing);
|
||||
INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
|
||||
|
||||
@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
|
||||
req->r_uid = current_fsuid();
|
||||
req->r_gid = current_fsgid();
|
||||
|
||||
if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
|
||||
mdsc->oldest_tid = req->r_tid;
|
||||
|
||||
if (dir) {
|
||||
struct ceph_inode_info *ci = ceph_inode(dir);
|
||||
|
||||
@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_request *req)
|
||||
{
|
||||
dout("__unregister_request %p tid %lld\n", req, req->r_tid);
|
||||
|
||||
if (req->r_tid == mdsc->oldest_tid) {
|
||||
struct rb_node *p = rb_next(&req->r_node);
|
||||
mdsc->oldest_tid = 0;
|
||||
while (p) {
|
||||
struct ceph_mds_request *next_req =
|
||||
rb_entry(p, struct ceph_mds_request, r_node);
|
||||
if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
|
||||
mdsc->oldest_tid = next_req->r_tid;
|
||||
break;
|
||||
}
|
||||
p = rb_next(p);
|
||||
}
|
||||
}
|
||||
|
||||
rb_erase(&req->r_node, &mdsc->request_tree);
|
||||
RB_CLEAR_NODE(&req->r_node);
|
||||
|
||||
@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
|
||||
* session caps
|
||||
*/
|
||||
|
||||
/*
|
||||
* Free preallocated cap messages assigned to this session
|
||||
*/
|
||||
static void cleanup_cap_releases(struct ceph_mds_session *session)
|
||||
/* caller holds s_cap_lock, we drop it */
|
||||
static void cleanup_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session)
|
||||
__releases(session->s_cap_lock)
|
||||
{
|
||||
struct ceph_msg *msg;
|
||||
|
||||
spin_lock(&session->s_cap_lock);
|
||||
while (!list_empty(&session->s_cap_releases)) {
|
||||
msg = list_first_entry(&session->s_cap_releases,
|
||||
struct ceph_msg, list_head);
|
||||
list_del_init(&msg->list_head);
|
||||
ceph_msg_put(msg);
|
||||
}
|
||||
while (!list_empty(&session->s_cap_releases_done)) {
|
||||
msg = list_first_entry(&session->s_cap_releases_done,
|
||||
struct ceph_msg, list_head);
|
||||
list_del_init(&msg->list_head);
|
||||
ceph_msg_put(msg);
|
||||
}
|
||||
LIST_HEAD(tmp_list);
|
||||
list_splice_init(&session->s_cap_releases, &tmp_list);
|
||||
session->s_num_cap_releases = 0;
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
|
||||
dout("cleanup_cap_releases mds%d\n", session->s_mds);
|
||||
while (!list_empty(&tmp_list)) {
|
||||
struct ceph_cap *cap;
|
||||
/* zero out the in-progress message */
|
||||
cap = list_first_entry(&tmp_list,
|
||||
struct ceph_cap, session_caps);
|
||||
list_del(&cap->session_caps);
|
||||
ceph_put_cap(mdsc, cap);
|
||||
}
|
||||
}
|
||||
|
||||
static void cleanup_session_requests(struct ceph_mds_client *mdsc,
|
||||
@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
|
||||
req = list_first_entry(&session->s_unsafe,
|
||||
struct ceph_mds_request, r_unsafe_item);
|
||||
list_del_init(&req->r_unsafe_item);
|
||||
pr_info(" dropping unsafe request %llu\n", req->r_tid);
|
||||
pr_warn_ratelimited(" dropping unsafe request %llu\n",
|
||||
req->r_tid);
|
||||
__unregister_request(mdsc, req);
|
||||
}
|
||||
/* zero r_attempts, so kick_requests() will re-send requests */
|
||||
@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session,
|
||||
dout("iterate_session_caps finishing cap %p removal\n",
|
||||
cap);
|
||||
BUG_ON(cap->session != session);
|
||||
cap->session = NULL;
|
||||
list_del_init(&cap->session_caps);
|
||||
session->s_nr_caps--;
|
||||
cap->session = NULL;
|
||||
old_cap = cap; /* put_cap it w/o locks held */
|
||||
if (cap->queue_release) {
|
||||
list_add_tail(&cap->session_caps,
|
||||
&session->s_cap_releases);
|
||||
session->s_num_cap_releases++;
|
||||
} else {
|
||||
old_cap = cap; /* put_cap it w/o locks held */
|
||||
}
|
||||
}
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
||||
void *arg)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
LIST_HEAD(to_remove);
|
||||
int drop = 0;
|
||||
|
||||
dout("removing cap %p, ci is %p, inode is %p\n",
|
||||
@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
__ceph_remove_cap(cap, false);
|
||||
if (!ci->i_auth_cap) {
|
||||
struct ceph_cap_flush *cf;
|
||||
struct ceph_mds_client *mdsc =
|
||||
ceph_sb_to_client(inode->i_sb)->mdsc;
|
||||
|
||||
while (true) {
|
||||
struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
|
||||
if (!n)
|
||||
break;
|
||||
cf = rb_entry(n, struct ceph_cap_flush, i_node);
|
||||
rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
|
||||
list_add(&cf->list, &to_remove);
|
||||
}
|
||||
|
||||
spin_lock(&mdsc->cap_dirty_lock);
|
||||
|
||||
list_for_each_entry(cf, &to_remove, list)
|
||||
rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
|
||||
|
||||
if (!list_empty(&ci->i_dirty_item)) {
|
||||
pr_info(" dropping dirty %s state for %p %lld\n",
|
||||
pr_warn_ratelimited(
|
||||
" dropping dirty %s state for %p %lld\n",
|
||||
ceph_cap_string(ci->i_dirty_caps),
|
||||
inode, ceph_ino(inode));
|
||||
ci->i_dirty_caps = 0;
|
||||
@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
||||
drop = 1;
|
||||
}
|
||||
if (!list_empty(&ci->i_flushing_item)) {
|
||||
pr_info(" dropping dirty+flushing %s state for %p %lld\n",
|
||||
pr_warn_ratelimited(
|
||||
" dropping dirty+flushing %s state for %p %lld\n",
|
||||
ceph_cap_string(ci->i_flushing_caps),
|
||||
inode, ceph_ino(inode));
|
||||
ci->i_flushing_caps = 0;
|
||||
@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
|
||||
drop = 1;
|
||||
}
|
||||
spin_unlock(&mdsc->cap_dirty_lock);
|
||||
|
||||
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
|
||||
list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
|
||||
ci->i_prealloc_cap_flush = NULL;
|
||||
}
|
||||
}
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
while (!list_empty(&to_remove)) {
|
||||
struct ceph_cap_flush *cf;
|
||||
cf = list_first_entry(&to_remove,
|
||||
struct ceph_cap_flush, list);
|
||||
list_del(&cf->list);
|
||||
ceph_free_cap_flush(cf);
|
||||
}
|
||||
while (drop--)
|
||||
iput(inode);
|
||||
return 0;
|
||||
@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session)
|
||||
spin_lock(&session->s_cap_lock);
|
||||
}
|
||||
}
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
|
||||
// drop cap expires and unlock s_cap_lock
|
||||
cleanup_cap_releases(session->s_mdsc, session);
|
||||
|
||||
BUG_ON(session->s_nr_caps > 0);
|
||||
BUG_ON(!list_empty(&session->s_cap_flushing));
|
||||
cleanup_cap_releases(session);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
|
||||
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
|
||||
ceph_cap_string(used), ceph_cap_string(wanted));
|
||||
if (cap == ci->i_auth_cap) {
|
||||
if (ci->i_dirty_caps | ci->i_flushing_caps)
|
||||
if (ci->i_dirty_caps || ci->i_flushing_caps ||
|
||||
!list_empty(&ci->i_cap_snaps))
|
||||
goto out;
|
||||
if ((used | wanted) & CEPH_CAP_ANY_WR)
|
||||
goto out;
|
||||
@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc,
|
||||
session->s_trim_caps = 0;
|
||||
}
|
||||
|
||||
ceph_add_cap_releases(mdsc, session);
|
||||
ceph_send_cap_releases(mdsc, session);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate cap_release messages. If there is a partially full message
|
||||
* in the queue, try to allocate enough to cover its remainder, so that
|
||||
* we can send it immediately.
|
||||
*
|
||||
* Called under s_mutex.
|
||||
*/
|
||||
int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session)
|
||||
static int check_capsnap_flush(struct ceph_inode_info *ci,
|
||||
u64 want_snap_seq)
|
||||
{
|
||||
struct ceph_msg *msg, *partial = NULL;
|
||||
struct ceph_mds_cap_release *head;
|
||||
int err = -ENOMEM;
|
||||
int extra = mdsc->fsc->mount_options->cap_release_safety;
|
||||
int num;
|
||||
|
||||
dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
|
||||
extra);
|
||||
|
||||
spin_lock(&session->s_cap_lock);
|
||||
|
||||
if (!list_empty(&session->s_cap_releases)) {
|
||||
msg = list_first_entry(&session->s_cap_releases,
|
||||
struct ceph_msg,
|
||||
list_head);
|
||||
head = msg->front.iov_base;
|
||||
num = le32_to_cpu(head->num);
|
||||
if (num) {
|
||||
dout(" partial %p with (%d/%d)\n", msg, num,
|
||||
(int)CEPH_CAPS_PER_RELEASE);
|
||||
extra += CEPH_CAPS_PER_RELEASE - num;
|
||||
partial = msg;
|
||||
}
|
||||
int ret = 1;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
|
||||
struct ceph_cap_snap *capsnap =
|
||||
list_first_entry(&ci->i_cap_snaps,
|
||||
struct ceph_cap_snap, ci_item);
|
||||
ret = capsnap->follows >= want_snap_seq;
|
||||
}
|
||||
while (session->s_num_cap_releases < session->s_nr_caps + extra) {
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
|
||||
GFP_NOFS, false);
|
||||
if (!msg)
|
||||
goto out_unlocked;
|
||||
dout("add_cap_releases %p msg %p now %d\n", session, msg,
|
||||
(int)msg->front.iov_len);
|
||||
head = msg->front.iov_base;
|
||||
head->num = cpu_to_le32(0);
|
||||
msg->front.iov_len = sizeof(*head);
|
||||
spin_lock(&session->s_cap_lock);
|
||||
list_add(&msg->list_head, &session->s_cap_releases);
|
||||
session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
|
||||
}
|
||||
|
||||
if (partial) {
|
||||
head = partial->front.iov_base;
|
||||
num = le32_to_cpu(head->num);
|
||||
dout(" queueing partial %p with %d/%d\n", partial, num,
|
||||
(int)CEPH_CAPS_PER_RELEASE);
|
||||
list_move_tail(&partial->list_head,
|
||||
&session->s_cap_releases_done);
|
||||
session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
|
||||
}
|
||||
err = 0;
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
out_unlocked:
|
||||
return err;
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
|
||||
static int check_caps_flush(struct ceph_mds_client *mdsc,
|
||||
u64 want_flush_tid)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
int ret;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (ci->i_flushing_caps)
|
||||
ret = ci->i_cap_flush_seq >= want_flush_seq;
|
||||
else
|
||||
ret = 1;
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
struct rb_node *n;
|
||||
struct ceph_cap_flush *cf;
|
||||
int ret = 1;
|
||||
|
||||
spin_lock(&mdsc->cap_dirty_lock);
|
||||
n = rb_first(&mdsc->cap_flush_tree);
|
||||
cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
|
||||
if (cf && cf->tid <= want_flush_tid) {
|
||||
dout("check_caps_flush still flushing tid %llu <= %llu\n",
|
||||
cf->tid, want_flush_tid);
|
||||
ret = 0;
|
||||
}
|
||||
spin_unlock(&mdsc->cap_dirty_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* flush all dirty inode data to disk.
|
||||
*
|
||||
* returns true if we've flushed through want_flush_seq
|
||||
* returns true if we've flushed through want_flush_tid
|
||||
*/
|
||||
static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
|
||||
static void wait_caps_flush(struct ceph_mds_client *mdsc,
|
||||
u64 want_flush_tid, u64 want_snap_seq)
|
||||
{
|
||||
int mds;
|
||||
|
||||
dout("check_cap_flush want %lld\n", want_flush_seq);
|
||||
dout("check_caps_flush want %llu snap want %llu\n",
|
||||
want_flush_tid, want_snap_seq);
|
||||
mutex_lock(&mdsc->mutex);
|
||||
for (mds = 0; mds < mdsc->max_sessions; mds++) {
|
||||
for (mds = 0; mds < mdsc->max_sessions; ) {
|
||||
struct ceph_mds_session *session = mdsc->sessions[mds];
|
||||
struct inode *inode = NULL;
|
||||
|
||||
if (!session)
|
||||
if (!session) {
|
||||
mds++;
|
||||
continue;
|
||||
}
|
||||
get_session(session);
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
|
||||
mutex_lock(&session->s_mutex);
|
||||
if (!list_empty(&session->s_cap_flushing)) {
|
||||
struct ceph_inode_info *ci =
|
||||
list_entry(session->s_cap_flushing.next,
|
||||
struct ceph_inode_info,
|
||||
i_flushing_item);
|
||||
|
||||
if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
|
||||
dout("check_cap_flush still flushing %p "
|
||||
"seq %lld <= %lld to mds%d\n",
|
||||
&ci->vfs_inode, ci->i_cap_flush_seq,
|
||||
want_flush_seq, session->s_mds);
|
||||
if (!list_empty(&session->s_cap_snaps_flushing)) {
|
||||
struct ceph_cap_snap *capsnap =
|
||||
list_first_entry(&session->s_cap_snaps_flushing,
|
||||
struct ceph_cap_snap,
|
||||
flushing_item);
|
||||
struct ceph_inode_info *ci = capsnap->ci;
|
||||
if (!check_capsnap_flush(ci, want_snap_seq)) {
|
||||
dout("check_cap_flush still flushing snap %p "
|
||||
"follows %lld <= %lld to mds%d\n",
|
||||
&ci->vfs_inode, capsnap->follows,
|
||||
want_snap_seq, mds);
|
||||
inode = igrab(&ci->vfs_inode);
|
||||
}
|
||||
}
|
||||
@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
|
||||
|
||||
if (inode) {
|
||||
wait_event(mdsc->cap_flushing_wq,
|
||||
check_cap_flush(inode, want_flush_seq));
|
||||
check_capsnap_flush(ceph_inode(inode),
|
||||
want_snap_seq));
|
||||
iput(inode);
|
||||
} else {
|
||||
mds++;
|
||||
}
|
||||
|
||||
mutex_lock(&mdsc->mutex);
|
||||
}
|
||||
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
|
||||
|
||||
wait_event(mdsc->cap_flushing_wq,
|
||||
check_caps_flush(mdsc, want_flush_tid));
|
||||
|
||||
dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1557,62 +1576,76 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
|
||||
void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session)
|
||||
{
|
||||
struct ceph_msg *msg;
|
||||
struct ceph_msg *msg = NULL;
|
||||
struct ceph_mds_cap_release *head;
|
||||
struct ceph_mds_cap_item *item;
|
||||
struct ceph_cap *cap;
|
||||
LIST_HEAD(tmp_list);
|
||||
int num_cap_releases;
|
||||
|
||||
dout("send_cap_releases mds%d\n", session->s_mds);
|
||||
spin_lock(&session->s_cap_lock);
|
||||
while (!list_empty(&session->s_cap_releases_done)) {
|
||||
msg = list_first_entry(&session->s_cap_releases_done,
|
||||
struct ceph_msg, list_head);
|
||||
list_del_init(&msg->list_head);
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
again:
|
||||
list_splice_init(&session->s_cap_releases, &tmp_list);
|
||||
num_cap_releases = session->s_num_cap_releases;
|
||||
session->s_num_cap_releases = 0;
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
|
||||
while (!list_empty(&tmp_list)) {
|
||||
if (!msg) {
|
||||
msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
|
||||
PAGE_CACHE_SIZE, GFP_NOFS, false);
|
||||
if (!msg)
|
||||
goto out_err;
|
||||
head = msg->front.iov_base;
|
||||
head->num = cpu_to_le32(0);
|
||||
msg->front.iov_len = sizeof(*head);
|
||||
}
|
||||
cap = list_first_entry(&tmp_list, struct ceph_cap,
|
||||
session_caps);
|
||||
list_del(&cap->session_caps);
|
||||
num_cap_releases--;
|
||||
|
||||
head = msg->front.iov_base;
|
||||
le32_add_cpu(&head->num, 1);
|
||||
item = msg->front.iov_base + msg->front.iov_len;
|
||||
item->ino = cpu_to_le64(cap->cap_ino);
|
||||
item->cap_id = cpu_to_le64(cap->cap_id);
|
||||
item->migrate_seq = cpu_to_le32(cap->mseq);
|
||||
item->seq = cpu_to_le32(cap->issue_seq);
|
||||
msg->front.iov_len += sizeof(*item);
|
||||
|
||||
ceph_put_cap(mdsc, cap);
|
||||
|
||||
if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
|
||||
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
|
||||
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
|
||||
ceph_con_send(&session->s_con, msg);
|
||||
msg = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
BUG_ON(num_cap_releases != 0);
|
||||
|
||||
spin_lock(&session->s_cap_lock);
|
||||
if (!list_empty(&session->s_cap_releases))
|
||||
goto again;
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
|
||||
if (msg) {
|
||||
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
|
||||
dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
|
||||
ceph_con_send(&session->s_con, msg);
|
||||
spin_lock(&session->s_cap_lock);
|
||||
}
|
||||
return;
|
||||
out_err:
|
||||
pr_err("send_cap_releases mds%d, failed to allocate message\n",
|
||||
session->s_mds);
|
||||
spin_lock(&session->s_cap_lock);
|
||||
list_splice(&tmp_list, &session->s_cap_releases);
|
||||
session->s_num_cap_releases += num_cap_releases;
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
}
|
||||
|
||||
/*
 * Throw away the cap release records queued on @session while keeping the
 * preallocated messages for reuse.
 *
 * The first message on s_cap_releases and every message on
 * s_cap_releases_done is reset to an empty header (num = 0, front length =
 * sizeof(*head)); the number of records each one held is added back to
 * s_num_cap_releases, and the "done" messages are moved back onto
 * s_cap_releases.
 *
 * @mdsc is not used in the body.
 * NOTE(review): no lock is taken here; presumably the caller holds
 * session->s_cap_lock -- confirm against the call site.
 */
static void discard_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session)
|
||||
{
|
||||
struct ceph_msg *msg;
|
||||
struct ceph_mds_cap_release *head;
|
||||
unsigned num;
|
||||
|
||||
dout("discard_cap_releases mds%d\n", session->s_mds);
|
||||
|
||||
if (!list_empty(&session->s_cap_releases)) {
|
||||
/* zero out the in-progress message */
|
||||
msg = list_first_entry(&session->s_cap_releases,
|
||||
struct ceph_msg, list_head);
|
||||
head = msg->front.iov_base;
|
||||
num = le32_to_cpu(head->num);
|
||||
dout("discard_cap_releases mds%d %p %u\n",
|
||||
session->s_mds, msg, num);
|
||||
head->num = cpu_to_le32(0);
|
||||
msg->front.iov_len = sizeof(*head);
|
||||
/* the discarded records become free slots again */
session->s_num_cap_releases += num;
|
||||
}
|
||||
|
||||
/* requeue completed messages */
|
||||
while (!list_empty(&session->s_cap_releases_done)) {
|
||||
msg = list_first_entry(&session->s_cap_releases_done,
|
||||
struct ceph_msg, list_head);
|
||||
list_del_init(&msg->list_head);
|
||||
|
||||
head = msg->front.iov_base;
|
||||
num = le32_to_cpu(head->num);
|
||||
dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
|
||||
num);
|
||||
session->s_num_cap_releases += num;
|
||||
head->num = cpu_to_le32(0);
|
||||
msg->front.iov_len = sizeof(*head);
|
||||
/* emptied message goes to the front of the pending list */
list_add(&msg->list_head, &session->s_cap_releases);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* requests
|
||||
*/
|
||||
@ -1635,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
|
||||
|
||||
order = get_order(size * num_entries);
|
||||
while (order >= 0) {
|
||||
rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
|
||||
rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL |
|
||||
__GFP_NOWARN,
|
||||
order);
|
||||
if (rinfo->dir_in)
|
||||
break;
|
||||
@ -1697,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
|
||||
struct ceph_mds_request, r_node);
|
||||
}
|
||||
|
||||
static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
|
||||
static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
|
||||
{
|
||||
struct ceph_mds_request *req = __get_oldest_req(mdsc);
|
||||
|
||||
if (req)
|
||||
return req->r_tid;
|
||||
return 0;
|
||||
return mdsc->oldest_tid;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2267,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
|
||||
/* wait */
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
dout("do_request waiting\n");
|
||||
if (req->r_timeout) {
|
||||
err = (long)wait_for_completion_killable_timeout(
|
||||
&req->r_completion, req->r_timeout);
|
||||
if (err == 0)
|
||||
err = -EIO;
|
||||
} else if (req->r_wait_for_completion) {
|
||||
if (!req->r_timeout && req->r_wait_for_completion) {
|
||||
err = req->r_wait_for_completion(mdsc, req);
|
||||
} else {
|
||||
err = wait_for_completion_killable(&req->r_completion);
|
||||
long timeleft = wait_for_completion_killable_timeout(
|
||||
&req->r_completion,
|
||||
ceph_timeout_jiffies(req->r_timeout));
|
||||
if (timeleft > 0)
|
||||
err = 0;
|
||||
else if (!timeleft)
|
||||
err = -EIO; /* timed out */
|
||||
else
|
||||
err = timeleft; /* killed */
|
||||
}
|
||||
dout("do_request waited, got %d\n", err);
|
||||
mutex_lock(&mdsc->mutex);
|
||||
@ -2496,7 +2529,6 @@ out_err:
|
||||
}
|
||||
mutex_unlock(&mdsc->mutex);
|
||||
|
||||
ceph_add_cap_releases(mdsc, req->r_session);
|
||||
mutex_unlock(&session->s_mutex);
|
||||
|
||||
/* kick calling process */
|
||||
@ -2888,8 +2920,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
|
||||
*/
|
||||
session->s_cap_reconnect = 1;
|
||||
/* drop old cap expires; we're about to reestablish that state */
|
||||
discard_cap_releases(mdsc, session);
|
||||
spin_unlock(&session->s_cap_lock);
|
||||
cleanup_cap_releases(mdsc, session);
|
||||
|
||||
/* trim unused caps to reduce MDS's cache rejoin time */
|
||||
if (mdsc->fsc->sb->s_root)
|
||||
@ -2956,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
|
||||
|
||||
reply->hdr.data_len = cpu_to_le32(pagelist->length);
|
||||
ceph_msg_data_add_pagelist(reply, pagelist);
|
||||
|
||||
ceph_early_kick_flushing_caps(mdsc, session);
|
||||
|
||||
ceph_con_send(&session->s_con, reply);
|
||||
|
||||
mutex_unlock(&session->s_mutex);
|
||||
@ -3352,7 +3386,6 @@ static void delayed_work(struct work_struct *work)
|
||||
send_renew_caps(mdsc, s);
|
||||
else
|
||||
ceph_con_keepalive(&s->s_con);
|
||||
ceph_add_cap_releases(mdsc, s);
|
||||
if (s->s_state == CEPH_MDS_SESSION_OPEN ||
|
||||
s->s_state == CEPH_MDS_SESSION_HUNG)
|
||||
ceph_send_cap_releases(mdsc, s);
|
||||
@ -3390,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
|
||||
atomic_set(&mdsc->num_sessions, 0);
|
||||
mdsc->max_sessions = 0;
|
||||
mdsc->stopping = 0;
|
||||
mdsc->last_snap_seq = 0;
|
||||
init_rwsem(&mdsc->snap_rwsem);
|
||||
mdsc->snap_realms = RB_ROOT;
|
||||
INIT_LIST_HEAD(&mdsc->snap_empty);
|
||||
spin_lock_init(&mdsc->snap_empty_lock);
|
||||
mdsc->last_tid = 0;
|
||||
mdsc->oldest_tid = 0;
|
||||
mdsc->request_tree = RB_ROOT;
|
||||
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
|
||||
mdsc->last_renew_caps = jiffies;
|
||||
@ -3402,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
|
||||
spin_lock_init(&mdsc->cap_delay_lock);
|
||||
INIT_LIST_HEAD(&mdsc->snap_flush_list);
|
||||
spin_lock_init(&mdsc->snap_flush_lock);
|
||||
mdsc->cap_flush_seq = 0;
|
||||
mdsc->last_cap_flush_tid = 1;
|
||||
mdsc->cap_flush_tree = RB_ROOT;
|
||||
INIT_LIST_HEAD(&mdsc->cap_dirty);
|
||||
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
|
||||
mdsc->num_cap_flushing = 0;
|
||||
@ -3414,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
|
||||
ceph_caps_init(mdsc);
|
||||
ceph_adjust_min_caps(mdsc, fsc->min_caps);
|
||||
|
||||
init_rwsem(&mdsc->pool_perm_rwsem);
|
||||
mdsc->pool_perm_tree = RB_ROOT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3423,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
|
||||
*/
|
||||
static void wait_requests(struct ceph_mds_client *mdsc)
|
||||
{
|
||||
struct ceph_options *opts = mdsc->fsc->client->options;
|
||||
struct ceph_mds_request *req;
|
||||
struct ceph_fs_client *fsc = mdsc->fsc;
|
||||
|
||||
mutex_lock(&mdsc->mutex);
|
||||
if (__get_oldest_req(mdsc)) {
|
||||
@ -3432,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
|
||||
|
||||
dout("wait_requests waiting for requests\n");
|
||||
wait_for_completion_timeout(&mdsc->safe_umount_waiters,
|
||||
fsc->client->options->mount_timeout * HZ);
|
||||
ceph_timeout_jiffies(opts->mount_timeout));
|
||||
|
||||
/* tear down remaining requests */
|
||||
mutex_lock(&mdsc->mutex);
|
||||
@ -3485,7 +3524,8 @@ restart:
|
||||
nextreq = rb_entry(n, struct ceph_mds_request, r_node);
|
||||
else
|
||||
nextreq = NULL;
|
||||
if ((req->r_op & CEPH_MDS_OP_WRITE)) {
|
||||
if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
|
||||
(req->r_op & CEPH_MDS_OP_WRITE)) {
|
||||
/* write op */
|
||||
ceph_mdsc_get_request(req);
|
||||
if (nextreq)
|
||||
@ -3513,7 +3553,7 @@ restart:
|
||||
|
||||
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
|
||||
{
|
||||
u64 want_tid, want_flush;
|
||||
u64 want_tid, want_flush, want_snap;
|
||||
|
||||
if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
|
||||
return;
|
||||
@ -3525,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
|
||||
|
||||
ceph_flush_dirty_caps(mdsc);
|
||||
spin_lock(&mdsc->cap_dirty_lock);
|
||||
want_flush = mdsc->cap_flush_seq;
|
||||
want_flush = mdsc->last_cap_flush_tid;
|
||||
spin_unlock(&mdsc->cap_dirty_lock);
|
||||
|
||||
dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
|
||||
down_read(&mdsc->snap_rwsem);
|
||||
want_snap = mdsc->last_snap_seq;
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
|
||||
dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
|
||||
want_tid, want_flush, want_snap);
|
||||
|
||||
wait_unsafe_requests(mdsc, want_tid);
|
||||
wait_caps_flush(mdsc, want_flush);
|
||||
wait_caps_flush(mdsc, want_flush, want_snap);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3549,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc)
|
||||
*/
|
||||
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
|
||||
{
|
||||
struct ceph_options *opts = mdsc->fsc->client->options;
|
||||
struct ceph_mds_session *session;
|
||||
int i;
|
||||
struct ceph_fs_client *fsc = mdsc->fsc;
|
||||
unsigned long timeout = fsc->client->options->mount_timeout * HZ;
|
||||
|
||||
dout("close_sessions\n");
|
||||
|
||||
@ -3573,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
|
||||
|
||||
dout("waiting for sessions to close\n");
|
||||
wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
|
||||
timeout);
|
||||
ceph_timeout_jiffies(opts->mount_timeout));
|
||||
|
||||
/* tear down remaining sessions */
|
||||
mutex_lock(&mdsc->mutex);
|
||||
@ -3607,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
|
||||
ceph_mdsmap_destroy(mdsc->mdsmap);
|
||||
kfree(mdsc->sessions);
|
||||
ceph_caps_finalize(mdsc);
|
||||
ceph_pool_perm_destroy(mdsc);
|
||||
}
|
||||
|
||||
void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
|
||||
|
@ -139,7 +139,6 @@ struct ceph_mds_session {
|
||||
int s_cap_reconnect;
|
||||
int s_readonly;
|
||||
struct list_head s_cap_releases; /* waiting cap_release messages */
|
||||
struct list_head s_cap_releases_done; /* ready to send */
|
||||
struct ceph_cap *s_cap_iterator;
|
||||
|
||||
/* protected by mutex */
|
||||
@ -228,7 +227,7 @@ struct ceph_mds_request {
|
||||
int r_err;
|
||||
bool r_aborted;
|
||||
|
||||
unsigned long r_timeout; /* optional. jiffies */
|
||||
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
|
||||
unsigned long r_started; /* start time to measure timeout against */
|
||||
unsigned long r_request_started; /* start time for mds request only,
|
||||
used to measure lease durations */
|
||||
@ -254,12 +253,21 @@ struct ceph_mds_request {
|
||||
bool r_got_unsafe, r_got_safe, r_got_result;
|
||||
|
||||
bool r_did_prepopulate;
|
||||
long long r_dir_release_cnt;
|
||||
long long r_dir_ordered_cnt;
|
||||
int r_readdir_cache_idx;
|
||||
u32 r_readdir_offset;
|
||||
|
||||
struct ceph_cap_reservation r_caps_reservation;
|
||||
int r_num_caps;
|
||||
};
|
||||
|
||||
struct ceph_pool_perm {
|
||||
struct rb_node node;
|
||||
u32 pool;
|
||||
int perm;
|
||||
};
|
||||
|
||||
/*
|
||||
* mds client state
|
||||
*/
|
||||
@ -284,12 +292,15 @@ struct ceph_mds_client {
|
||||
* references (implying they contain no inodes with caps) that
|
||||
* should be destroyed.
|
||||
*/
|
||||
u64 last_snap_seq;
|
||||
struct rw_semaphore snap_rwsem;
|
||||
struct rb_root snap_realms;
|
||||
struct list_head snap_empty;
|
||||
spinlock_t snap_empty_lock; /* protect snap_empty */
|
||||
|
||||
u64 last_tid; /* most recent mds request */
|
||||
u64 oldest_tid; /* oldest incomplete mds request,
|
||||
excluding setfilelock requests */
|
||||
struct rb_root request_tree; /* pending mds requests */
|
||||
struct delayed_work delayed_work; /* delayed work */
|
||||
unsigned long last_renew_caps; /* last time we renewed our caps */
|
||||
@ -298,7 +309,8 @@ struct ceph_mds_client {
|
||||
struct list_head snap_flush_list; /* cap_snaps ready to flush */
|
||||
spinlock_t snap_flush_lock;
|
||||
|
||||
u64 cap_flush_seq;
|
||||
u64 last_cap_flush_tid;
|
||||
struct rb_root cap_flush_tree;
|
||||
struct list_head cap_dirty; /* inodes with dirty caps */
|
||||
struct list_head cap_dirty_migrating; /* ...that are migration... */
|
||||
int num_cap_flushing; /* # caps we are flushing */
|
||||
@ -328,6 +340,9 @@ struct ceph_mds_client {
|
||||
spinlock_t dentry_lru_lock;
|
||||
struct list_head dentry_lru;
|
||||
int num_dentry;
|
||||
|
||||
struct rw_semaphore pool_perm_rwsem;
|
||||
struct rb_root pool_perm_tree;
|
||||
};
|
||||
|
||||
extern const char *ceph_mds_op_name(int op);
|
||||
@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
|
||||
kref_put(&req->r_kref, ceph_mdsc_release_request);
|
||||
}
|
||||
|
||||
extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
|
||||
|
181
fs/ceph/snap.c
181
fs/ceph/snap.c
@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
|
||||
}
|
||||
|
||||
|
||||
static struct ceph_snap_context *empty_snapc;
|
||||
struct ceph_snap_context *ceph_empty_snapc;
|
||||
|
||||
/*
|
||||
* build the snap context for a given realm.
|
||||
@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (num == 0 && realm->seq == empty_snapc->seq) {
|
||||
ceph_get_snap_context(empty_snapc);
|
||||
snapc = empty_snapc;
|
||||
if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
|
||||
ceph_get_snap_context(ceph_empty_snapc);
|
||||
snapc = ceph_empty_snapc;
|
||||
goto done;
|
||||
}
|
||||
|
||||
@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool has_new_snaps(struct ceph_snap_context *o,
|
||||
struct ceph_snap_context *n)
|
||||
{
|
||||
if (n->num_snaps == 0)
|
||||
return false;
|
||||
/* snaps are in descending order */
|
||||
return n->snaps[0] > o->seq;
|
||||
}
|
||||
|
||||
/*
|
||||
* When a snapshot is applied, the size/mtime inode metadata is queued
|
||||
@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
|
||||
{
|
||||
struct inode *inode = &ci->vfs_inode;
|
||||
struct ceph_cap_snap *capsnap;
|
||||
struct ceph_snap_context *old_snapc, *new_snapc;
|
||||
int used, dirty;
|
||||
|
||||
capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
|
||||
@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
|
||||
used = __ceph_caps_used(ci);
|
||||
dirty = __ceph_caps_dirty(ci);
|
||||
|
||||
old_snapc = ci->i_head_snapc;
|
||||
new_snapc = ci->i_snap_realm->cached_context;
|
||||
|
||||
/*
|
||||
* If there is a write in progress, treat that as a dirty Fw,
|
||||
* even though it hasn't completed yet; by the time we finish
|
||||
@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
|
||||
writes in progress now were started before the previous
|
||||
cap_snap. lucky us. */
|
||||
dout("queue_cap_snap %p already pending\n", inode);
|
||||
kfree(capsnap);
|
||||
} else if (ci->i_snap_realm->cached_context == empty_snapc) {
|
||||
dout("queue_cap_snap %p empty snapc\n", inode);
|
||||
kfree(capsnap);
|
||||
} else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
|
||||
CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
|
||||
struct ceph_snap_context *snapc = ci->i_head_snapc;
|
||||
|
||||
/*
|
||||
* if we are a sync write, we may need to go to the snaprealm
|
||||
* to get the current snapc.
|
||||
*/
|
||||
if (!snapc)
|
||||
snapc = ci->i_snap_realm->cached_context;
|
||||
|
||||
dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
|
||||
inode, capsnap, snapc, ceph_cap_string(dirty));
|
||||
ihold(inode);
|
||||
|
||||
atomic_set(&capsnap->nref, 1);
|
||||
capsnap->ci = ci;
|
||||
INIT_LIST_HEAD(&capsnap->ci_item);
|
||||
INIT_LIST_HEAD(&capsnap->flushing_item);
|
||||
|
||||
capsnap->follows = snapc->seq;
|
||||
capsnap->issued = __ceph_caps_issued(ci, NULL);
|
||||
capsnap->dirty = dirty;
|
||||
|
||||
capsnap->mode = inode->i_mode;
|
||||
capsnap->uid = inode->i_uid;
|
||||
capsnap->gid = inode->i_gid;
|
||||
|
||||
if (dirty & CEPH_CAP_XATTR_EXCL) {
|
||||
__ceph_build_xattrs_blob(ci);
|
||||
capsnap->xattr_blob =
|
||||
ceph_buffer_get(ci->i_xattrs.blob);
|
||||
capsnap->xattr_version = ci->i_xattrs.version;
|
||||
} else {
|
||||
capsnap->xattr_blob = NULL;
|
||||
capsnap->xattr_version = 0;
|
||||
}
|
||||
|
||||
capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
|
||||
|
||||
/* dirty page count moved from _head to this cap_snap;
|
||||
all subsequent writes page dirties occur _after_ this
|
||||
snapshot. */
|
||||
capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
|
||||
ci->i_wrbuffer_ref_head = 0;
|
||||
capsnap->context = snapc;
|
||||
ci->i_head_snapc =
|
||||
ceph_get_snap_context(ci->i_snap_realm->cached_context);
|
||||
dout(" new snapc is %p\n", ci->i_head_snapc);
|
||||
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
|
||||
|
||||
if (used & CEPH_CAP_FILE_WR) {
|
||||
dout("queue_cap_snap %p cap_snap %p snapc %p"
|
||||
" seq %llu used WR, now pending\n", inode,
|
||||
capsnap, snapc, snapc->seq);
|
||||
capsnap->writing = 1;
|
||||
} else {
|
||||
/* note mtime, size NOW. */
|
||||
__ceph_finish_cap_snap(ci, capsnap);
|
||||
}
|
||||
} else {
|
||||
goto update_snapc;
|
||||
}
|
||||
if (ci->i_wrbuffer_ref_head == 0 &&
|
||||
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
|
||||
dout("queue_cap_snap %p nothing dirty|writing\n", inode);
|
||||
kfree(capsnap);
|
||||
goto update_snapc;
|
||||
}
|
||||
|
||||
BUG_ON(!old_snapc);
|
||||
|
||||
/*
|
||||
* There is no need to send FLUSHSNAP message to MDS if there is
|
||||
* no new snapshot. But when there is dirty pages or on-going
|
||||
* writes, we still need to create cap_snap. cap_snap is needed
|
||||
* by the write path and page writeback path.
|
||||
*
|
||||
* also see ceph_try_drop_cap_snap()
|
||||
*/
|
||||
if (has_new_snaps(old_snapc, new_snapc)) {
|
||||
if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
|
||||
capsnap->need_flush = true;
|
||||
} else {
|
||||
if (!(used & CEPH_CAP_FILE_WR) &&
|
||||
ci->i_wrbuffer_ref_head == 0) {
|
||||
dout("queue_cap_snap %p "
|
||||
"no new_snap|dirty_page|writing\n", inode);
|
||||
goto update_snapc;
|
||||
}
|
||||
}
|
||||
|
||||
dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
|
||||
inode, capsnap, old_snapc, ceph_cap_string(dirty),
|
||||
capsnap->need_flush ? "" : "no_flush");
|
||||
ihold(inode);
|
||||
|
||||
atomic_set(&capsnap->nref, 1);
|
||||
capsnap->ci = ci;
|
||||
INIT_LIST_HEAD(&capsnap->ci_item);
|
||||
INIT_LIST_HEAD(&capsnap->flushing_item);
|
||||
|
||||
capsnap->follows = old_snapc->seq;
|
||||
capsnap->issued = __ceph_caps_issued(ci, NULL);
|
||||
capsnap->dirty = dirty;
|
||||
|
||||
capsnap->mode = inode->i_mode;
|
||||
capsnap->uid = inode->i_uid;
|
||||
capsnap->gid = inode->i_gid;
|
||||
|
||||
if (dirty & CEPH_CAP_XATTR_EXCL) {
|
||||
__ceph_build_xattrs_blob(ci);
|
||||
capsnap->xattr_blob =
|
||||
ceph_buffer_get(ci->i_xattrs.blob);
|
||||
capsnap->xattr_version = ci->i_xattrs.version;
|
||||
} else {
|
||||
capsnap->xattr_blob = NULL;
|
||||
capsnap->xattr_version = 0;
|
||||
}
|
||||
|
||||
capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
|
||||
|
||||
/* dirty page count moved from _head to this cap_snap;
|
||||
all subsequent writes page dirties occur _after_ this
|
||||
snapshot. */
|
||||
capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
|
||||
ci->i_wrbuffer_ref_head = 0;
|
||||
capsnap->context = old_snapc;
|
||||
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
|
||||
old_snapc = NULL;
|
||||
|
||||
if (used & CEPH_CAP_FILE_WR) {
|
||||
dout("queue_cap_snap %p cap_snap %p snapc %p"
|
||||
" seq %llu used WR, now pending\n", inode,
|
||||
capsnap, old_snapc, old_snapc->seq);
|
||||
capsnap->writing = 1;
|
||||
} else {
|
||||
/* note mtime, size NOW. */
|
||||
__ceph_finish_cap_snap(ci, capsnap);
|
||||
}
|
||||
capsnap = NULL;
|
||||
|
||||
update_snapc:
|
||||
if (ci->i_head_snapc) {
|
||||
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
|
||||
dout(" new snapc is %p\n", new_snapc);
|
||||
}
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
|
||||
kfree(capsnap);
|
||||
ceph_put_snap_context(old_snapc);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -699,6 +730,8 @@ more:
|
||||
|
||||
/* queue realm for cap_snap creation */
|
||||
list_add(&realm->dirty_item, &dirty_realms);
|
||||
if (realm->seq > mdsc->last_snap_seq)
|
||||
mdsc->last_snap_seq = realm->seq;
|
||||
|
||||
invalidate = 1;
|
||||
} else if (!realm->cached_context) {
|
||||
@ -964,14 +997,14 @@ out:
|
||||
|
||||
int __init ceph_snap_init(void)
|
||||
{
|
||||
empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
|
||||
if (!empty_snapc)
|
||||
ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
|
||||
if (!ceph_empty_snapc)
|
||||
return -ENOMEM;
|
||||
empty_snapc->seq = 1;
|
||||
ceph_empty_snapc->seq = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ceph_snap_exit(void)
|
||||
{
|
||||
ceph_put_snap_context(empty_snapc);
|
||||
ceph_put_snap_context(ceph_empty_snapc);
|
||||
}
|
||||
|
@ -134,10 +134,12 @@ enum {
|
||||
Opt_noino32,
|
||||
Opt_fscache,
|
||||
Opt_nofscache,
|
||||
Opt_poolperm,
|
||||
Opt_nopoolperm,
|
||||
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||
Opt_acl,
|
||||
#endif
|
||||
Opt_noacl
|
||||
Opt_noacl,
|
||||
};
|
||||
|
||||
static match_table_t fsopt_tokens = {
|
||||
@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = {
|
||||
{Opt_noino32, "noino32"},
|
||||
{Opt_fscache, "fsc"},
|
||||
{Opt_nofscache, "nofsc"},
|
||||
{Opt_poolperm, "poolperm"},
|
||||
{Opt_nopoolperm, "nopoolperm"},
|
||||
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||
{Opt_acl, "acl"},
|
||||
#endif
|
||||
@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private)
|
||||
case Opt_nofscache:
|
||||
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
|
||||
break;
|
||||
case Opt_poolperm:
|
||||
fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM;
|
||||
printk ("pool perm");
|
||||
break;
|
||||
case Opt_nopoolperm:
|
||||
fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM;
|
||||
break;
|
||||
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||
case Opt_acl:
|
||||
fsopt->sb_flags |= MS_POSIXACL;
|
||||
@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
|
||||
seq_puts(m, ",nodcache");
|
||||
if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
|
||||
seq_puts(m, ",fsc");
|
||||
if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
|
||||
seq_puts(m, ",nopoolperm");
|
||||
|
||||
#ifdef CONFIG_CEPH_FS_POSIX_ACL
|
||||
if (fsopt->sb_flags & MS_POSIXACL)
|
||||
@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
|
||||
*/
|
||||
struct kmem_cache *ceph_inode_cachep;
|
||||
struct kmem_cache *ceph_cap_cachep;
|
||||
struct kmem_cache *ceph_cap_flush_cachep;
|
||||
struct kmem_cache *ceph_dentry_cachep;
|
||||
struct kmem_cache *ceph_file_cachep;
|
||||
|
||||
@ -634,6 +648,10 @@ static int __init init_caches(void)
|
||||
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
|
||||
if (ceph_cap_cachep == NULL)
|
||||
goto bad_cap;
|
||||
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
|
||||
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
|
||||
if (ceph_cap_flush_cachep == NULL)
|
||||
goto bad_cap_flush;
|
||||
|
||||
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
|
||||
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
|
||||
@ -652,6 +670,8 @@ static int __init init_caches(void)
|
||||
bad_file:
|
||||
kmem_cache_destroy(ceph_dentry_cachep);
|
||||
bad_dentry:
|
||||
kmem_cache_destroy(ceph_cap_flush_cachep);
|
||||
bad_cap_flush:
|
||||
kmem_cache_destroy(ceph_cap_cachep);
|
||||
bad_cap:
|
||||
kmem_cache_destroy(ceph_inode_cachep);
|
||||
@ -668,6 +688,7 @@ static void destroy_caches(void)
|
||||
|
||||
kmem_cache_destroy(ceph_inode_cachep);
|
||||
kmem_cache_destroy(ceph_cap_cachep);
|
||||
kmem_cache_destroy(ceph_cap_flush_cachep);
|
||||
kmem_cache_destroy(ceph_dentry_cachep);
|
||||
kmem_cache_destroy(ceph_file_cachep);
|
||||
|
||||
@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
|
||||
req->r_ino1.ino = CEPH_INO_ROOT;
|
||||
req->r_ino1.snap = CEPH_NOSNAP;
|
||||
req->r_started = started;
|
||||
req->r_timeout = fsc->client->options->mount_timeout * HZ;
|
||||
req->r_timeout = fsc->client->options->mount_timeout;
|
||||
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
|
||||
req->r_num_caps = 2;
|
||||
err = ceph_mdsc_do_request(mdsc, NULL, req);
|
||||
|
125
fs/ceph/super.h
125
fs/ceph/super.h
@ -35,6 +35,7 @@
|
||||
#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
|
||||
#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
|
||||
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
|
||||
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
|
||||
|
||||
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \
|
||||
CEPH_MOUNT_OPT_DCACHE)
|
||||
@ -121,11 +122,21 @@ struct ceph_cap {
|
||||
struct rb_node ci_node; /* per-ci cap tree */
|
||||
struct ceph_mds_session *session;
|
||||
struct list_head session_caps; /* per-session caplist */
|
||||
int mds;
|
||||
u64 cap_id; /* unique cap id (mds provided) */
|
||||
int issued; /* latest, from the mds */
|
||||
int implemented; /* implemented superset of issued (for revocation) */
|
||||
int mds_wanted;
|
||||
union {
|
||||
/* in-use caps */
|
||||
struct {
|
||||
int issued; /* latest, from the mds */
|
||||
int implemented; /* implemented superset of
|
||||
issued (for revocation) */
|
||||
int mds, mds_wanted;
|
||||
};
|
||||
/* caps to release */
|
||||
struct {
|
||||
u64 cap_ino;
|
||||
int queue_release;
|
||||
};
|
||||
};
|
||||
u32 seq, issue_seq, mseq;
|
||||
u32 cap_gen; /* active/stale cycle */
|
||||
unsigned long last_used;
|
||||
@ -163,6 +174,7 @@ struct ceph_cap_snap {
|
||||
int writing; /* a sync write is still in progress */
|
||||
int dirty_pages; /* dirty pages awaiting writeback */
|
||||
bool inline_data;
|
||||
bool need_flush;
|
||||
};
|
||||
|
||||
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
|
||||
@ -174,6 +186,17 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
|
||||
}
|
||||
}
|
||||
|
||||
struct ceph_cap_flush {
|
||||
u64 tid;
|
||||
int caps;
|
||||
bool kick;
|
||||
struct rb_node g_node; // global
|
||||
union {
|
||||
struct rb_node i_node; // inode
|
||||
struct list_head list;
|
||||
};
|
||||
};
|
||||
|
||||
/*
|
||||
* The frag tree describes how a directory is fragmented, potentially across
|
||||
* multiple metadata servers. It is also used to indicate points where
|
||||
@ -259,9 +282,9 @@ struct ceph_inode_info {
|
||||
u32 i_time_warp_seq;
|
||||
|
||||
unsigned i_ceph_flags;
|
||||
int i_ordered_count;
|
||||
atomic_t i_release_count;
|
||||
atomic_t i_complete_count;
|
||||
atomic64_t i_release_count;
|
||||
atomic64_t i_ordered_count;
|
||||
atomic64_t i_complete_seq[2];
|
||||
|
||||
struct ceph_dir_layout i_dir_layout;
|
||||
struct ceph_file_layout i_layout;
|
||||
@ -283,11 +306,11 @@ struct ceph_inode_info {
|
||||
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
|
||||
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
|
||||
struct list_head i_dirty_item, i_flushing_item;
|
||||
u64 i_cap_flush_seq;
|
||||
/* we need to track cap writeback on a per-cap-bit basis, to allow
|
||||
* overlapping, pipelined cap flushes to the mds. we can probably
|
||||
* reduce the tid to 8 bits if we're concerned about inode size. */
|
||||
u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
|
||||
struct ceph_cap_flush *i_prealloc_cap_flush;
|
||||
struct rb_root i_cap_flush_tree;
|
||||
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
|
||||
unsigned long i_hold_caps_min; /* jiffies */
|
||||
unsigned long i_hold_caps_max; /* jiffies */
|
||||
@ -438,36 +461,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
|
||||
/*
|
||||
* Ceph inode.
|
||||
*/
|
||||
#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */
|
||||
#define CEPH_I_NODELAY 4 /* do not delay cap release */
|
||||
#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
|
||||
#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
|
||||
#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
|
||||
#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
|
||||
#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
|
||||
#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */
|
||||
#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
|
||||
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
|
||||
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
|
||||
|
||||
|
||||
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
|
||||
int release_count, int ordered_count)
|
||||
long long release_count,
|
||||
long long ordered_count)
|
||||
{
|
||||
atomic_set(&ci->i_complete_count, release_count);
|
||||
if (ci->i_ordered_count == ordered_count)
|
||||
ci->i_ceph_flags |= CEPH_I_DIR_ORDERED;
|
||||
else
|
||||
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
|
||||
smp_mb__before_atomic();
|
||||
atomic64_set(&ci->i_complete_seq[0], release_count);
|
||||
atomic64_set(&ci->i_complete_seq[1], ordered_count);
|
||||
}
|
||||
|
||||
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
|
||||
{
|
||||
atomic_inc(&ci->i_release_count);
|
||||
atomic64_inc(&ci->i_release_count);
|
||||
}
|
||||
|
||||
static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
|
||||
{
|
||||
atomic64_inc(&ci->i_ordered_count);
|
||||
}
|
||||
|
||||
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
|
||||
{
|
||||
return atomic_read(&ci->i_complete_count) ==
|
||||
atomic_read(&ci->i_release_count);
|
||||
return atomic64_read(&ci->i_complete_seq[0]) ==
|
||||
atomic64_read(&ci->i_release_count);
|
||||
}
|
||||
|
||||
static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
|
||||
{
|
||||
return __ceph_dir_is_complete(ci) &&
|
||||
(ci->i_ceph_flags & CEPH_I_DIR_ORDERED);
|
||||
return atomic64_read(&ci->i_complete_seq[0]) ==
|
||||
atomic64_read(&ci->i_release_count) &&
|
||||
atomic64_read(&ci->i_complete_seq[1]) ==
|
||||
atomic64_read(&ci->i_ordered_count);
|
||||
}
|
||||
|
||||
static inline void ceph_dir_clear_complete(struct inode *inode)
|
||||
@ -477,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
|
||||
|
||||
static inline void ceph_dir_clear_ordered(struct inode *inode)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
ci->i_ordered_count++;
|
||||
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
__ceph_dir_clear_ordered(ceph_inode(inode));
|
||||
}
|
||||
|
||||
static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
|
||||
{
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
bool ret;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
ret = __ceph_dir_is_complete_ordered(ci);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
|
||||
smp_rmb();
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -552,7 +578,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
|
||||
{
|
||||
return ci->i_dirty_caps | ci->i_flushing_caps;
|
||||
}
|
||||
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
|
||||
extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
|
||||
extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
|
||||
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
|
||||
struct ceph_cap_flush **pcf);
|
||||
|
||||
extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
|
||||
struct ceph_cap *ocap, int mask);
|
||||
@ -606,16 +635,20 @@ struct ceph_file_info {
|
||||
unsigned offset; /* offset of last chunk, adjusted for . and .. */
|
||||
unsigned next_offset; /* offset of next chunk (last_name's + 1) */
|
||||
char *last_name; /* last entry in previous chunk */
|
||||
struct dentry *dentry; /* next dentry (for dcache readdir) */
|
||||
int dir_release_count;
|
||||
int dir_ordered_count;
|
||||
long long dir_release_count;
|
||||
long long dir_ordered_count;
|
||||
int readdir_cache_idx;
|
||||
|
||||
/* used for -o dirstat read() on directory thing */
|
||||
char *dir_info;
|
||||
int dir_info_len;
|
||||
};
|
||||
|
||||
|
||||
struct ceph_readdir_cache_control {
|
||||
struct page *page;
|
||||
struct dentry **dentries;
|
||||
int index;
|
||||
};
|
||||
|
||||
/*
|
||||
* A "snap realm" describes a subset of the file hierarchy sharing
|
||||
@ -687,6 +720,7 @@ static inline int default_congestion_kb(void)
|
||||
|
||||
|
||||
/* snap.c */
|
||||
extern struct ceph_snap_context *ceph_empty_snapc;
|
||||
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
|
||||
u64 ino);
|
||||
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
|
||||
@ -713,8 +747,8 @@ extern void ceph_snap_exit(void);
|
||||
static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
|
||||
{
|
||||
return !list_empty(&ci->i_cap_snaps) &&
|
||||
list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
|
||||
ci_item)->writing;
|
||||
list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap,
|
||||
ci_item)->writing;
|
||||
}
|
||||
|
||||
/* inode.c */
|
||||
@ -838,12 +872,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc,
|
||||
struct ceph_cap *cap);
|
||||
extern int ceph_is_any_caps(struct inode *inode);
|
||||
|
||||
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
|
||||
u64 cap_id, u32 migrate_seq, u32 issue_seq);
|
||||
extern void ceph_queue_caps_release(struct inode *inode);
|
||||
extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
|
||||
extern int ceph_fsync(struct file *file, loff_t start, loff_t end,
|
||||
int datasync);
|
||||
extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_session *session);
|
||||
extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
|
||||
@ -879,6 +913,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
|
||||
/* addr.c */
|
||||
extern const struct address_space_operations ceph_aops;
|
||||
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
|
||||
extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
|
||||
extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need);
|
||||
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
|
||||
|
||||
/* file.c */
|
||||
extern const struct file_operations ceph_file_fops;
|
||||
@ -890,7 +927,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
|
||||
extern int ceph_release(struct inode *inode, struct file *filp);
|
||||
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
|
||||
char *data, size_t len);
|
||||
int ceph_uninline_data(struct file *filp, struct page *locked_page);
|
||||
/* dir.c */
|
||||
extern const struct file_operations ceph_dir_fops;
|
||||
extern const struct file_operations ceph_snapdir_fops;
|
||||
@ -911,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn);
|
||||
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
|
||||
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
|
||||
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
|
||||
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
|
||||
|
||||
/*
|
||||
* our d_ops vary depending on whether the inode is live,
|
||||
|
@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
|
||||
struct inode *inode = d_inode(dentry);
|
||||
struct ceph_vxattr *vxattr;
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
|
||||
struct ceph_cap_flush *prealloc_cf = NULL;
|
||||
int issued;
|
||||
int err;
|
||||
int dirty = 0;
|
||||
@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
|
||||
char *newval = NULL;
|
||||
struct ceph_inode_xattr *xattr = NULL;
|
||||
int required_blob_size;
|
||||
bool lock_snap_rwsem = false;
|
||||
|
||||
if (!ceph_is_valid_xattr(name))
|
||||
return -EOPNOTSUPP;
|
||||
@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
|
||||
if (!xattr)
|
||||
goto out;
|
||||
|
||||
prealloc_cf = ceph_alloc_cap_flush();
|
||||
if (!prealloc_cf)
|
||||
goto out;
|
||||
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
retry:
|
||||
issued = __ceph_caps_issued(ci, NULL);
|
||||
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
|
||||
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
|
||||
goto do_sync;
|
||||
|
||||
if (!lock_snap_rwsem && !ci->i_head_snapc) {
|
||||
lock_snap_rwsem = true;
|
||||
if (!down_read_trylock(&mdsc->snap_rwsem)) {
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
down_read(&mdsc->snap_rwsem);
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
|
||||
__build_xattrs(inode);
|
||||
|
||||
required_blob_size = __get_required_blob_size(ci, name_len, val_len);
|
||||
@ -966,7 +984,7 @@ retry:
|
||||
dout(" preaallocating new blob size=%d\n", required_blob_size);
|
||||
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
|
||||
if (!blob)
|
||||
goto out;
|
||||
goto do_sync_unlocked;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (ci->i_xattrs.prealloc_blob)
|
||||
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
|
||||
@ -978,21 +996,28 @@ retry:
|
||||
flags, value ? 1 : -1, &xattr);
|
||||
|
||||
if (!err) {
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
|
||||
&prealloc_cf);
|
||||
ci->i_xattrs.dirty = true;
|
||||
inode->i_ctime = CURRENT_TIME;
|
||||
}
|
||||
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
if (lock_snap_rwsem)
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
if (dirty)
|
||||
__mark_inode_dirty(inode, dirty);
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
return err;
|
||||
|
||||
do_sync:
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
do_sync_unlocked:
|
||||
if (lock_snap_rwsem)
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
err = ceph_sync_setxattr(dentry, name, value, size, flags);
|
||||
out:
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
kfree(newname);
|
||||
kfree(newval);
|
||||
kfree(xattr);
|
||||
@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
|
||||
struct inode *inode = d_inode(dentry);
|
||||
struct ceph_vxattr *vxattr;
|
||||
struct ceph_inode_info *ci = ceph_inode(inode);
|
||||
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
|
||||
struct ceph_cap_flush *prealloc_cf = NULL;
|
||||
int issued;
|
||||
int err;
|
||||
int required_blob_size;
|
||||
int dirty;
|
||||
bool lock_snap_rwsem = false;
|
||||
|
||||
if (!ceph_is_valid_xattr(name))
|
||||
return -EOPNOTSUPP;
|
||||
@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
|
||||
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
|
||||
goto do_sync_unlocked;
|
||||
|
||||
prealloc_cf = ceph_alloc_cap_flush();
|
||||
if (!prealloc_cf)
|
||||
return -ENOMEM;
|
||||
|
||||
err = -ENOMEM;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
retry:
|
||||
issued = __ceph_caps_issued(ci, NULL);
|
||||
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
|
||||
|
||||
if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
|
||||
goto do_sync;
|
||||
|
||||
if (!lock_snap_rwsem && !ci->i_head_snapc) {
|
||||
lock_snap_rwsem = true;
|
||||
if (!down_read_trylock(&mdsc->snap_rwsem)) {
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
down_read(&mdsc->snap_rwsem);
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
|
||||
|
||||
__build_xattrs(inode);
|
||||
|
||||
required_blob_size = __get_required_blob_size(ci, 0, 0);
|
||||
@ -1080,7 +1123,7 @@ retry:
|
||||
dout(" preaallocating new blob size=%d\n", required_blob_size);
|
||||
blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
|
||||
if (!blob)
|
||||
goto out;
|
||||
goto do_sync_unlocked;
|
||||
spin_lock(&ci->i_ceph_lock);
|
||||
if (ci->i_xattrs.prealloc_blob)
|
||||
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
|
||||
@ -1090,18 +1133,24 @@ retry:
|
||||
|
||||
err = __remove_xattr_by_name(ceph_inode(inode), name);
|
||||
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
|
||||
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
|
||||
&prealloc_cf);
|
||||
ci->i_xattrs.dirty = true;
|
||||
inode->i_ctime = CURRENT_TIME;
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
if (lock_snap_rwsem)
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
if (dirty)
|
||||
__mark_inode_dirty(inode, dirty);
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
return err;
|
||||
do_sync:
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
do_sync_unlocked:
|
||||
if (lock_snap_rwsem)
|
||||
up_read(&mdsc->snap_rwsem);
|
||||
ceph_free_cap_flush(prealloc_cf);
|
||||
err = ceph_send_removexattr(dentry, name);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
|
@ -43,9 +43,9 @@ struct ceph_options {
|
||||
int flags;
|
||||
struct ceph_fsid fsid;
|
||||
struct ceph_entity_addr my_addr;
|
||||
int mount_timeout;
|
||||
int osd_idle_ttl;
|
||||
int osd_keepalive_timeout;
|
||||
unsigned long mount_timeout; /* jiffies */
|
||||
unsigned long osd_idle_ttl; /* jiffies */
|
||||
unsigned long osd_keepalive_timeout; /* jiffies */
|
||||
|
||||
/*
|
||||
* any type that can't be simply compared or doesn't need
|
||||
@ -63,9 +63,9 @@ struct ceph_options {
|
||||
/*
|
||||
* defaults
|
||||
*/
|
||||
#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
|
||||
#define CEPH_OSD_KEEPALIVE_DEFAULT 5
|
||||
#define CEPH_OSD_IDLE_TTL_DEFAULT 60
|
||||
#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
|
||||
#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
|
||||
#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
|
||||
|
||||
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
|
||||
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
|
||||
@ -93,13 +93,9 @@ enum {
|
||||
CEPH_MOUNT_SHUTDOWN,
|
||||
};
|
||||
|
||||
/*
|
||||
* subtract jiffies
|
||||
*/
|
||||
static inline unsigned long time_sub(unsigned long a, unsigned long b)
|
||||
static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
|
||||
{
|
||||
BUG_ON(time_after(b, a));
|
||||
return (long)a - (long)b;
|
||||
return timeout ?: MAX_SCHEDULE_TIMEOUT;
|
||||
}
|
||||
|
||||
struct ceph_mds_client;
|
||||
@ -178,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len)
|
||||
|
||||
extern struct kmem_cache *ceph_inode_cachep;
|
||||
extern struct kmem_cache *ceph_cap_cachep;
|
||||
extern struct kmem_cache *ceph_cap_flush_cachep;
|
||||
extern struct kmem_cache *ceph_dentry_cachep;
|
||||
extern struct kmem_cache *ceph_file_cachep;
|
||||
|
||||
|
@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
|
||||
struct ceph_msg *msg);
|
||||
|
||||
extern void osd_req_op_init(struct ceph_osd_request *osd_req,
|
||||
unsigned int which, u16 opcode);
|
||||
unsigned int which, u16 opcode, u32 flags);
|
||||
|
||||
extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
|
||||
unsigned int which,
|
||||
|
@ -1,7 +1,11 @@
|
||||
#ifndef CEPH_CRUSH_CRUSH_H
|
||||
#define CEPH_CRUSH_CRUSH_H
|
||||
|
||||
#include <linux/types.h>
|
||||
#ifdef __KERNEL__
|
||||
# include <linux/types.h>
|
||||
#else
|
||||
# include "crush_compat.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* CRUSH is a pseudo-random data distribution algorithm that
|
||||
@ -20,7 +24,11 @@
|
||||
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
|
||||
|
||||
#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
|
||||
#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
|
||||
#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
|
||||
|
||||
#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
|
||||
#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
|
||||
|
||||
#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
|
||||
#define CRUSH_ITEM_NONE 0x7fffffff /* no result */
|
||||
@ -108,6 +116,15 @@ enum {
|
||||
};
|
||||
extern const char *crush_bucket_alg_name(int alg);
|
||||
|
||||
/*
|
||||
* although tree was a legacy algorithm, it has been buggy, so
|
||||
* exclude it.
|
||||
*/
|
||||
#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
|
||||
(1 << CRUSH_BUCKET_UNIFORM) | \
|
||||
(1 << CRUSH_BUCKET_LIST) | \
|
||||
(1 << CRUSH_BUCKET_STRAW))
|
||||
|
||||
struct crush_bucket {
|
||||
__s32 id; /* this'll be negative */
|
||||
__u16 type; /* non-zero; type=0 is reserved for devices */
|
||||
@ -174,7 +191,7 @@ struct crush_map {
|
||||
/* choose local attempts using a fallback permutation before
|
||||
* re-descent */
|
||||
__u32 choose_local_fallback_tries;
|
||||
/* choose attempts before giving up */
|
||||
/* choose attempts before giving up */
|
||||
__u32 choose_total_tries;
|
||||
/* attempt chooseleaf inner descent once for firstn mode; on
|
||||
* reject retry outer descent. Note that this does *not*
|
||||
@ -187,6 +204,25 @@ struct crush_map {
|
||||
* that want to limit reshuffling, a value of 3 or 4 will make the
|
||||
* mappings line up a bit better with previous mappings. */
|
||||
__u8 chooseleaf_vary_r;
|
||||
|
||||
#ifndef __KERNEL__
|
||||
/*
|
||||
* version 0 (original) of straw_calc has various flaws. version 1
|
||||
* fixes a few of them.
|
||||
*/
|
||||
__u8 straw_calc_version;
|
||||
|
||||
/*
|
||||
* allowed bucket algs is a bitmask, here the bit positions
|
||||
* are CRUSH_BUCKET_*. note that these are *bits* and
|
||||
* CRUSH_BUCKET_* values are not, so we need to OR together (1
|
||||
* << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
|
||||
* minimize confusion (bucket type values start at 1).
|
||||
*/
|
||||
__u32 allowed_bucket_algs;
|
||||
|
||||
__u32 *choose_tries;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
@ -1,6 +1,12 @@
|
||||
#ifndef CEPH_CRUSH_HASH_H
|
||||
#define CEPH_CRUSH_HASH_H
|
||||
|
||||
#ifdef __KERNEL__
|
||||
# include <linux/types.h>
|
||||
#else
|
||||
# include "crush_compat.h"
|
||||
#endif
|
||||
|
||||
#define CRUSH_HASH_RJENKINS1 0
|
||||
|
||||
#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
|
||||
|
@ -8,7 +8,7 @@
|
||||
* LGPL2
|
||||
*/
|
||||
|
||||
#include <linux/crush/crush.h>
|
||||
#include "crush.h"
|
||||
|
||||
extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
|
||||
extern int crush_do_rule(const struct crush_map *map,
|
||||
|
@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
|
||||
/* start with defaults */
|
||||
opt->flags = CEPH_OPT_DEFAULT;
|
||||
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
|
||||
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
|
||||
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
|
||||
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
|
||||
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
|
||||
|
||||
/* get mon ip(s) */
|
||||
/* ip1[:port1][,ip2[:port2]...] */
|
||||
@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
|
||||
pr_warn("ignoring deprecated osdtimeout option\n");
|
||||
break;
|
||||
case Opt_osdkeepalivetimeout:
|
||||
opt->osd_keepalive_timeout = intval;
|
||||
/* 0 isn't well defined right now, reject it */
|
||||
if (intval < 1 || intval > INT_MAX / 1000) {
|
||||
pr_err("osdkeepalive out of range\n");
|
||||
err = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
opt->osd_keepalive_timeout =
|
||||
msecs_to_jiffies(intval * 1000);
|
||||
break;
|
||||
case Opt_osd_idle_ttl:
|
||||
opt->osd_idle_ttl = intval;
|
||||
/* 0 isn't well defined right now, reject it */
|
||||
if (intval < 1 || intval > INT_MAX / 1000) {
|
||||
pr_err("osd_idle_ttl out of range\n");
|
||||
err = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
|
||||
break;
|
||||
case Opt_mount_timeout:
|
||||
opt->mount_timeout = intval;
|
||||
/* 0 is "wait forever" (i.e. infinite timeout) */
|
||||
if (intval < 0 || intval > INT_MAX / 1000) {
|
||||
pr_err("mount_timeout out of range\n");
|
||||
err = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
opt->mount_timeout = msecs_to_jiffies(intval * 1000);
|
||||
break;
|
||||
|
||||
case Opt_share:
|
||||
@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
|
||||
seq_puts(m, "notcp_nodelay,");
|
||||
|
||||
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
|
||||
seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
|
||||
seq_printf(m, "mount_timeout=%d,",
|
||||
jiffies_to_msecs(opt->mount_timeout) / 1000);
|
||||
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
|
||||
seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
|
||||
seq_printf(m, "osd_idle_ttl=%d,",
|
||||
jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
|
||||
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
|
||||
seq_printf(m, "osdkeepalivetimeout=%d,",
|
||||
opt->osd_keepalive_timeout);
|
||||
jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
|
||||
|
||||
/* drop redundant comma */
|
||||
if (m->count != pos)
|
||||
@ -626,8 +647,8 @@ static int have_mon_and_osd_map(struct ceph_client *client)
|
||||
*/
|
||||
int __ceph_open_session(struct ceph_client *client, unsigned long started)
|
||||
{
|
||||
int err;
|
||||
unsigned long timeout = client->options->mount_timeout * HZ;
|
||||
unsigned long timeout = client->options->mount_timeout;
|
||||
long err;
|
||||
|
||||
/* open session, and wait for mon and osd maps */
|
||||
err = ceph_monc_open_session(&client->monc);
|
||||
@ -635,16 +656,15 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
|
||||
return err;
|
||||
|
||||
while (!have_mon_and_osd_map(client)) {
|
||||
err = -EIO;
|
||||
if (timeout && time_after_eq(jiffies, started + timeout))
|
||||
return err;
|
||||
return -ETIMEDOUT;
|
||||
|
||||
/* wait */
|
||||
dout("mount waiting for mon_map\n");
|
||||
err = wait_event_interruptible_timeout(client->auth_wq,
|
||||
have_mon_and_osd_map(client) || (client->auth_err < 0),
|
||||
timeout);
|
||||
if (err == -EINTR || err == -ERESTARTSYS)
|
||||
ceph_timeout_jiffies(timeout));
|
||||
if (err < 0)
|
||||
return err;
|
||||
if (client->auth_err < 0)
|
||||
return client->auth_err;
|
||||
@ -721,5 +741,5 @@ module_exit(exit_ceph_lib);
|
||||
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
|
||||
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
|
||||
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
|
||||
MODULE_DESCRIPTION("Ceph filesystem for Linux");
|
||||
MODULE_DESCRIPTION("Ceph core library");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
@ -1,15 +1,11 @@
|
||||
|
||||
#ifdef __KERNEL__
|
||||
# include <linux/slab.h>
|
||||
# include <linux/crush/crush.h>
|
||||
#else
|
||||
# include <stdlib.h>
|
||||
# include <assert.h>
|
||||
# define kfree(x) do { if (x) free(x); } while (0)
|
||||
# define BUG_ON(x) assert(!(x))
|
||||
# include "crush_compat.h"
|
||||
# include "crush.h"
|
||||
#endif
|
||||
|
||||
#include <linux/crush/crush.h>
|
||||
|
||||
const char *crush_bucket_alg_name(int alg)
|
||||
{
|
||||
switch (alg) {
|
||||
@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
|
||||
kfree(map->rules);
|
||||
}
|
||||
|
||||
#ifndef __KERNEL__
|
||||
kfree(map->choose_tries);
|
||||
#endif
|
||||
kfree(map);
|
||||
}
|
||||
|
||||
|
@ -10,20 +10,20 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#if defined(__linux__)
|
||||
#include <linux/types.h>
|
||||
#elif defined(__FreeBSD__)
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#ifndef CEPH_CRUSH_LN_H
|
||||
#define CEPH_CRUSH_LN_H
|
||||
|
||||
#ifdef __KERNEL__
|
||||
# include <linux/types.h>
|
||||
#else
|
||||
# include "crush_compat.h"
|
||||
#endif
|
||||
|
||||
// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
|
||||
// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
|
||||
|
||||
static int64_t __RH_LH_tbl[128*2+2] = {
|
||||
/*
|
||||
* RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
|
||||
* RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
|
||||
*/
|
||||
static __s64 __RH_LH_tbl[128*2+2] = {
|
||||
0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
|
||||
0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
|
||||
0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
|
||||
@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = {
|
||||
0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
|
||||
0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
|
||||
0x0000800000000000ll, 0x0000ffff00000000ll,
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
// LL_tbl[k] = 2^48*log2(1.0+k/2^15);
|
||||
static int64_t __LL_tbl[256] = {
|
||||
/*
|
||||
* LL_tbl[k] = 2^48*log2(1.0+k/2^15)
|
||||
*/
|
||||
static __s64 __LL_tbl[256] = {
|
||||
0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
|
||||
0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
|
||||
0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
|
||||
@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = {
|
||||
0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
@ -1,6 +1,8 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/crush/hash.h>
|
||||
#ifdef __KERNEL__
|
||||
# include <linux/crush/hash.h>
|
||||
#else
|
||||
# include "hash.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Robert Jenkins' function for mixing 32-bit values
|
||||
|
@ -1,27 +1,31 @@
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2015 Intel Corporation All Rights Reserved
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifdef __KERNEL__
|
||||
# include <linux/string.h>
|
||||
# include <linux/slab.h>
|
||||
# include <linux/bug.h>
|
||||
# include <linux/kernel.h>
|
||||
# ifndef dprintk
|
||||
# define dprintk(args...)
|
||||
# endif
|
||||
# include <linux/crush/crush.h>
|
||||
# include <linux/crush/hash.h>
|
||||
#else
|
||||
# include <string.h>
|
||||
# include <stdio.h>
|
||||
# include <stdlib.h>
|
||||
# include <assert.h>
|
||||
# define BUG_ON(x) assert(!(x))
|
||||
# define dprintk(args...) /* printf(args) */
|
||||
# define kmalloc(x, f) malloc(x)
|
||||
# define kfree(x) free(x)
|
||||
# include "crush_compat.h"
|
||||
# include "crush.h"
|
||||
# include "hash.h"
|
||||
#endif
|
||||
|
||||
#include <linux/crush/crush.h>
|
||||
#include <linux/crush/hash.h>
|
||||
#include "crush_ln_table.h"
|
||||
|
||||
#define dprintk(args...) /* printf(args) */
|
||||
|
||||
/*
|
||||
* Implement the core CRUSH mapping algorithm.
|
||||
*/
|
||||
@ -139,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
|
||||
int i;
|
||||
|
||||
for (i = bucket->h.size-1; i >= 0; i--) {
|
||||
__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
|
||||
__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
|
||||
r, bucket->h.id);
|
||||
w &= 0xffff;
|
||||
dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
|
||||
@ -238,43 +242,46 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
|
||||
return bucket->h.items[high];
|
||||
}
|
||||
|
||||
// compute 2^44*log2(input+1)
|
||||
uint64_t crush_ln(unsigned xin)
|
||||
/* compute 2^44*log2(input+1) */
|
||||
static __u64 crush_ln(unsigned int xin)
|
||||
{
|
||||
unsigned x=xin, x1;
|
||||
int iexpon, index1, index2;
|
||||
uint64_t RH, LH, LL, xl64, result;
|
||||
unsigned int x = xin, x1;
|
||||
int iexpon, index1, index2;
|
||||
__u64 RH, LH, LL, xl64, result;
|
||||
|
||||
x++;
|
||||
x++;
|
||||
|
||||
// normalize input
|
||||
iexpon = 15;
|
||||
while(!(x&0x18000)) { x<<=1; iexpon--; }
|
||||
/* normalize input */
|
||||
iexpon = 15;
|
||||
while (!(x & 0x18000)) {
|
||||
x <<= 1;
|
||||
iexpon--;
|
||||
}
|
||||
|
||||
index1 = (x>>8)<<1;
|
||||
// RH ~ 2^56/index1
|
||||
RH = __RH_LH_tbl[index1 - 256];
|
||||
// LH ~ 2^48 * log2(index1/256)
|
||||
LH = __RH_LH_tbl[index1 + 1 - 256];
|
||||
index1 = (x >> 8) << 1;
|
||||
/* RH ~ 2^56/index1 */
|
||||
RH = __RH_LH_tbl[index1 - 256];
|
||||
/* LH ~ 2^48 * log2(index1/256) */
|
||||
LH = __RH_LH_tbl[index1 + 1 - 256];
|
||||
|
||||
// RH*x ~ 2^48 * (2^15 + xf), xf<2^8
|
||||
xl64 = (int64_t)x * RH;
|
||||
xl64 >>= 48;
|
||||
x1 = xl64;
|
||||
/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
|
||||
xl64 = (__s64)x * RH;
|
||||
xl64 >>= 48;
|
||||
x1 = xl64;
|
||||
|
||||
result = iexpon;
|
||||
result <<= (12 + 32);
|
||||
result = iexpon;
|
||||
result <<= (12 + 32);
|
||||
|
||||
index2 = x1 & 0xff;
|
||||
// LL ~ 2^48*log2(1.0+index2/2^15)
|
||||
LL = __LL_tbl[index2];
|
||||
index2 = x1 & 0xff;
|
||||
/* LL ~ 2^48*log2(1.0+index2/2^15) */
|
||||
LL = __LL_tbl[index2];
|
||||
|
||||
LH = LH + LL;
|
||||
LH = LH + LL;
|
||||
|
||||
LH >>= (48-12 - 32);
|
||||
result += LH;
|
||||
LH >>= (48 - 12 - 32);
|
||||
result += LH;
|
||||
|
||||
return result;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@ -290,9 +297,9 @@ uint64_t crush_ln(unsigned xin)
|
||||
static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
|
||||
int x, int r)
|
||||
{
|
||||
unsigned i, high = 0;
|
||||
unsigned u;
|
||||
unsigned w;
|
||||
unsigned int i, high = 0;
|
||||
unsigned int u;
|
||||
unsigned int w;
|
||||
__s64 ln, draw, high_draw = 0;
|
||||
|
||||
for (i = 0; i < bucket->h.size; i++) {
|
||||
@ -567,6 +574,10 @@ reject:
|
||||
out[outpos] = item;
|
||||
outpos++;
|
||||
count--;
|
||||
#ifndef __KERNEL__
|
||||
if (map->choose_tries && ftotal <= map->choose_total_tries)
|
||||
map->choose_tries[ftotal]++;
|
||||
#endif
|
||||
}
|
||||
|
||||
dprintk("CHOOSE returns %d\n", outpos);
|
||||
@ -610,6 +621,20 @@ static void crush_choose_indep(const struct crush_map *map,
|
||||
}
|
||||
|
||||
for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
|
||||
#ifdef DEBUG_INDEP
|
||||
if (out2 && ftotal) {
|
||||
dprintk("%u %d a: ", ftotal, left);
|
||||
for (rep = outpos; rep < endpos; rep++) {
|
||||
dprintk(" %d", out[rep]);
|
||||
}
|
||||
dprintk("\n");
|
||||
dprintk("%u %d b: ", ftotal, left);
|
||||
for (rep = outpos; rep < endpos; rep++) {
|
||||
dprintk(" %d", out2[rep]);
|
||||
}
|
||||
dprintk("\n");
|
||||
}
|
||||
#endif
|
||||
for (rep = outpos; rep < endpos; rep++) {
|
||||
if (out[rep] != CRUSH_ITEM_UNDEF)
|
||||
continue;
|
||||
@ -726,6 +751,24 @@ static void crush_choose_indep(const struct crush_map *map,
|
||||
out2[rep] = CRUSH_ITEM_NONE;
|
||||
}
|
||||
}
|
||||
#ifndef __KERNEL__
|
||||
if (map->choose_tries && ftotal <= map->choose_total_tries)
|
||||
map->choose_tries[ftotal]++;
|
||||
#endif
|
||||
#ifdef DEBUG_INDEP
|
||||
if (out2) {
|
||||
dprintk("%u %d a: ", ftotal, left);
|
||||
for (rep = outpos; rep < endpos; rep++) {
|
||||
dprintk(" %d", out[rep]);
|
||||
}
|
||||
dprintk("\n");
|
||||
dprintk("%u %d b: ", ftotal, left);
|
||||
for (rep = outpos; rep < endpos; rep++) {
|
||||
dprintk(" %d", out2[rep]);
|
||||
}
|
||||
dprintk("\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@ -790,8 +833,15 @@ int crush_do_rule(const struct crush_map *map,
|
||||
|
||||
switch (curstep->op) {
|
||||
case CRUSH_RULE_TAKE:
|
||||
w[0] = curstep->arg1;
|
||||
wsize = 1;
|
||||
if ((curstep->arg1 >= 0 &&
|
||||
curstep->arg1 < map->max_devices) ||
|
||||
(-1-curstep->arg1 < map->max_buckets &&
|
||||
map->buckets[-1-curstep->arg1])) {
|
||||
w[0] = curstep->arg1;
|
||||
wsize = 1;
|
||||
} else {
|
||||
dprintk(" bad take value %d\n", curstep->arg1);
|
||||
}
|
||||
break;
|
||||
|
||||
case CRUSH_RULE_SET_CHOOSE_TRIES:
|
||||
@ -877,7 +927,7 @@ int crush_do_rule(const struct crush_map *map,
|
||||
0);
|
||||
} else {
|
||||
out_size = ((numrep < (result_max-osize)) ?
|
||||
numrep : (result_max-osize));
|
||||
numrep : (result_max-osize));
|
||||
crush_choose_indep(
|
||||
map,
|
||||
map->buckets[-1-w[i]],
|
||||
@ -923,5 +973,3 @@ int crush_do_rule(const struct crush_map *map,
|
||||
}
|
||||
return result_len;
|
||||
}
|
||||
|
||||
|
||||
|
@ -278,7 +278,6 @@ static void _ceph_msgr_exit(void)
|
||||
ceph_msgr_slab_exit();
|
||||
|
||||
BUG_ON(zero_page == NULL);
|
||||
kunmap(zero_page);
|
||||
page_cache_release(zero_page);
|
||||
zero_page = NULL;
|
||||
}
|
||||
@ -1545,7 +1544,7 @@ static int write_partial_message_data(struct ceph_connection *con)
|
||||
page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
|
||||
&last_piece);
|
||||
ret = ceph_tcp_sendpage(con->sock, page, page_offset,
|
||||
length, last_piece);
|
||||
length, !last_piece);
|
||||
if (ret <= 0) {
|
||||
if (do_datacrc)
|
||||
msg->footer.data_crc = cpu_to_le32(crc);
|
||||
|
@ -298,21 +298,28 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
|
||||
}
|
||||
EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
|
||||
|
||||
/*
|
||||
* Wait for an osdmap with a given epoch.
|
||||
*
|
||||
* @epoch: epoch to wait for
|
||||
* @timeout: in jiffies, 0 means "wait forever"
|
||||
*/
|
||||
int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
|
||||
unsigned long timeout)
|
||||
{
|
||||
unsigned long started = jiffies;
|
||||
int ret;
|
||||
long ret;
|
||||
|
||||
mutex_lock(&monc->mutex);
|
||||
while (monc->have_osdmap < epoch) {
|
||||
mutex_unlock(&monc->mutex);
|
||||
|
||||
if (timeout != 0 && time_after_eq(jiffies, started + timeout))
|
||||
if (timeout && time_after_eq(jiffies, started + timeout))
|
||||
return -ETIMEDOUT;
|
||||
|
||||
ret = wait_event_interruptible_timeout(monc->client->auth_wq,
|
||||
monc->have_osdmap >= epoch, timeout);
|
||||
monc->have_osdmap >= epoch,
|
||||
ceph_timeout_jiffies(timeout));
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
|
@ -296,6 +296,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
|
||||
case CEPH_OSD_OP_CMPXATTR:
|
||||
ceph_osd_data_release(&op->xattr.osd_data);
|
||||
break;
|
||||
case CEPH_OSD_OP_STAT:
|
||||
ceph_osd_data_release(&op->raw_data_in);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -450,7 +453,7 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
|
||||
*/
|
||||
static struct ceph_osd_req_op *
|
||||
_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
|
||||
u16 opcode)
|
||||
u16 opcode, u32 flags)
|
||||
{
|
||||
struct ceph_osd_req_op *op;
|
||||
|
||||
@ -460,14 +463,15 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
|
||||
op = &osd_req->r_ops[which];
|
||||
memset(op, 0, sizeof (*op));
|
||||
op->op = opcode;
|
||||
op->flags = flags;
|
||||
|
||||
return op;
|
||||
}
|
||||
|
||||
void osd_req_op_init(struct ceph_osd_request *osd_req,
|
||||
unsigned int which, u16 opcode)
|
||||
unsigned int which, u16 opcode, u32 flags)
|
||||
{
|
||||
(void)_osd_req_op_init(osd_req, which, opcode);
|
||||
(void)_osd_req_op_init(osd_req, which, opcode, flags);
|
||||
}
|
||||
EXPORT_SYMBOL(osd_req_op_init);
|
||||
|
||||
@ -476,7 +480,8 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
|
||||
u64 offset, u64 length,
|
||||
u64 truncate_size, u32 truncate_seq)
|
||||
{
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
||||
opcode, 0);
|
||||
size_t payload_len = 0;
|
||||
|
||||
BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
|
||||
@ -515,7 +520,8 @@ EXPORT_SYMBOL(osd_req_op_extent_update);
|
||||
void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
|
||||
u16 opcode, const char *class, const char *method)
|
||||
{
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
||||
opcode, 0);
|
||||
struct ceph_pagelist *pagelist;
|
||||
size_t payload_len = 0;
|
||||
size_t size;
|
||||
@ -552,7 +558,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
|
||||
u16 opcode, const char *name, const void *value,
|
||||
size_t size, u8 cmp_op, u8 cmp_mode)
|
||||
{
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
||||
opcode, 0);
|
||||
struct ceph_pagelist *pagelist;
|
||||
size_t payload_len;
|
||||
|
||||
@ -585,7 +592,8 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
|
||||
unsigned int which, u16 opcode,
|
||||
u64 cookie, u64 version, int flag)
|
||||
{
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
||||
opcode, 0);
|
||||
|
||||
BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
|
||||
|
||||
@ -602,7 +610,8 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
|
||||
u64 expected_write_size)
|
||||
{
|
||||
struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
|
||||
CEPH_OSD_OP_SETALLOCHINT);
|
||||
CEPH_OSD_OP_SETALLOCHINT,
|
||||
0);
|
||||
|
||||
op->alloc_hint.expected_object_size = expected_object_size;
|
||||
op->alloc_hint.expected_write_size = expected_write_size;
|
||||
@ -786,7 +795,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
|
||||
}
|
||||
|
||||
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
|
||||
osd_req_op_init(req, which, opcode);
|
||||
osd_req_op_init(req, which, opcode, 0);
|
||||
} else {
|
||||
u32 object_size = le32_to_cpu(layout->fl_object_size);
|
||||
u32 object_base = off - objoff;
|
||||
@ -1088,7 +1097,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
|
||||
BUG_ON(!list_empty(&osd->o_osd_lru));
|
||||
|
||||
list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
|
||||
osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
|
||||
osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
|
||||
}
|
||||
|
||||
static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
|
||||
@ -1199,7 +1208,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
|
||||
static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
|
||||
{
|
||||
schedule_delayed_work(&osdc->timeout_work,
|
||||
osdc->client->options->osd_keepalive_timeout * HZ);
|
||||
osdc->client->options->osd_keepalive_timeout);
|
||||
}
|
||||
|
||||
static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
|
||||
@ -1567,10 +1576,9 @@ static void handle_timeout(struct work_struct *work)
|
||||
{
|
||||
struct ceph_osd_client *osdc =
|
||||
container_of(work, struct ceph_osd_client, timeout_work.work);
|
||||
struct ceph_options *opts = osdc->client->options;
|
||||
struct ceph_osd_request *req;
|
||||
struct ceph_osd *osd;
|
||||
unsigned long keepalive =
|
||||
osdc->client->options->osd_keepalive_timeout * HZ;
|
||||
struct list_head slow_osds;
|
||||
dout("timeout\n");
|
||||
down_read(&osdc->map_sem);
|
||||
@ -1586,7 +1594,8 @@ static void handle_timeout(struct work_struct *work)
|
||||
*/
|
||||
INIT_LIST_HEAD(&slow_osds);
|
||||
list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
|
||||
if (time_before(jiffies, req->r_stamp + keepalive))
|
||||
if (time_before(jiffies,
|
||||
req->r_stamp + opts->osd_keepalive_timeout))
|
||||
break;
|
||||
|
||||
osd = req->r_osd;
|
||||
@ -1613,8 +1622,7 @@ static void handle_osds_timeout(struct work_struct *work)
|
||||
struct ceph_osd_client *osdc =
|
||||
container_of(work, struct ceph_osd_client,
|
||||
osds_timeout_work.work);
|
||||
unsigned long delay =
|
||||
osdc->client->options->osd_idle_ttl * HZ >> 2;
|
||||
unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
|
||||
|
||||
dout("osds timeout\n");
|
||||
down_read(&osdc->map_sem);
|
||||
@ -2619,7 +2627,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
|
||||
osdc->event_count = 0;
|
||||
|
||||
schedule_delayed_work(&osdc->osds_timeout_work,
|
||||
round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
|
||||
round_jiffies_relative(osdc->client->options->osd_idle_ttl));
|
||||
|
||||
err = -ENOMEM;
|
||||
osdc->req_mempool = mempool_create_kmalloc_pool(10,
|
||||
|
@ -89,7 +89,7 @@ static int crush_decode_tree_bucket(void **p, void *end,
|
||||
{
|
||||
int j;
|
||||
dout("crush_decode_tree_bucket %p to %p\n", *p, end);
|
||||
ceph_decode_32_safe(p, end, b->num_nodes, bad);
|
||||
ceph_decode_8_safe(p, end, b->num_nodes, bad);
|
||||
b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
|
||||
if (b->node_weights == NULL)
|
||||
return -ENOMEM;
|
||||
|
@ -51,10 +51,7 @@ void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
|
||||
set_page_dirty_lock(pages[i]);
|
||||
put_page(pages[i]);
|
||||
}
|
||||
if (is_vmalloc_addr(pages))
|
||||
vfree(pages);
|
||||
else
|
||||
kfree(pages);
|
||||
kvfree(pages);
|
||||
}
|
||||
EXPORT_SYMBOL(ceph_put_page_vector);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user