mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-10 07:00:48 +00:00
This commit is contained in:
commit
29552b1462
@ -12,10 +12,14 @@ cifs.txt
|
||||
- description of the CIFS filesystem
|
||||
coda.txt
|
||||
- description of the CODA filesystem.
|
||||
configfs/
|
||||
- directory containing configfs documentation and example code.
|
||||
cramfs.txt
|
||||
- info on the cram filesystem for small storage (ROMs etc)
|
||||
devfs/
|
||||
- directory containing devfs documentation.
|
||||
dlmfs.txt
|
||||
- info on the userspace interface to the OCFS2 DLM.
|
||||
ext2.txt
|
||||
- info, mount options and specifications for the Ext2 filesystem.
|
||||
hpfs.txt
|
||||
@ -30,6 +34,8 @@ ntfs.txt
|
||||
- info and mount options for the NTFS filesystem (Windows NT).
|
||||
proc.txt
|
||||
- info on Linux's /proc filesystem.
|
||||
ocfs2.txt
|
||||
- info and mount options for the OCFS2 clustered filesystem.
|
||||
romfs.txt
|
||||
- Description of the ROMFS filesystem.
|
||||
smbfs.txt
|
||||
|
434
Documentation/filesystems/configfs/configfs.txt
Normal file
434
Documentation/filesystems/configfs/configfs.txt
Normal file
@ -0,0 +1,434 @@
|
||||
|
||||
configfs - Userspace-driven kernel object configuration.
|
||||
|
||||
Joel Becker <joel.becker@oracle.com>
|
||||
|
||||
Updated: 31 March 2005
|
||||
|
||||
Copyright (c) 2005 Oracle Corporation,
|
||||
Joel Becker <joel.becker@oracle.com>
|
||||
|
||||
|
||||
[What is configfs?]
|
||||
|
||||
configfs is a ram-based filesystem that provides the converse of
|
||||
sysfs's functionality. Where sysfs is a filesystem-based view of
|
||||
kernel objects, configfs is a filesystem-based manager of kernel
|
||||
objects, or config_items.
|
||||
|
||||
With sysfs, an object is created in kernel (for example, when a device
|
||||
is discovered) and it is registered with sysfs. Its attributes then
|
||||
appear in sysfs, allowing userspace to read the attributes via
|
||||
readdir(3)/read(2). It may allow some attributes to be modified via
|
||||
write(2). The important point is that the object is created and
|
||||
destroyed in kernel, the kernel controls the lifecycle of the sysfs
|
||||
representation, and sysfs is merely a window on all this.
|
||||
|
||||
A configfs config_item is created via an explicit userspace operation:
|
||||
mkdir(2). It is destroyed via rmdir(2). The attributes appear at
|
||||
mkdir(2) time, and can be read or modified via read(2) and write(2).
|
||||
As with sysfs, readdir(3) queries the list of items and/or attributes.
|
||||
symlink(2) can be used to group items together. Unlike sysfs, the
|
||||
lifetime of the representation is completely driven by userspace. The
|
||||
kernel modules backing the items must respond to this.
|
||||
|
||||
Both sysfs and configfs can and should exist together on the same
|
||||
system. One is not a replacement for the other.
|
||||
|
||||
[Using configfs]
|
||||
|
||||
configfs can be compiled as a module or into the kernel. You can access
|
||||
it by doing
|
||||
|
||||
mount -t configfs none /config
|
||||
|
||||
The configfs tree will be empty unless client modules are also loaded.
|
||||
These are modules that register their item types with configfs as
|
||||
subsystems. Once a client subsystem is loaded, it will appear as a
|
||||
subdirectory (or more than one) under /config. Like sysfs, the
|
||||
configfs tree is always there, whether mounted on /config or not.
|
||||
|
||||
An item is created via mkdir(2). The item's attributes will also
|
||||
appear at this time. readdir(3) can determine what the attributes are,
|
||||
read(2) can query their default values, and write(2) can store new
|
||||
values. Like sysfs, attributes should be ASCII text files, preferably
|
||||
with only one value per file. The same efficiency caveats from sysfs
|
||||
apply. Don't mix more than one attribute in one attribute file.
|
||||
|
||||
Like sysfs, configfs expects write(2) to store the entire buffer at
|
||||
once. When writing to configfs attributes, userspace processes should
|
||||
first read the entire file, modify the portions they wish to change, and
|
||||
then write the entire buffer back. Attribute files have a maximum size
|
||||
of one page (PAGE_SIZE, 4096 on i386).
|
||||
|
||||
When an item needs to be destroyed, remove it with rmdir(2). An
|
||||
item cannot be destroyed if any other item has a link to it (via
|
||||
symlink(2)). Links can be removed via unlink(2).
|
||||
|
||||
[Configuring FakeNBD: an Example]
|
||||
|
||||
Imagine there's a Network Block Device (NBD) driver that allows you to
|
||||
access remote block devices. Call it FakeNBD. FakeNBD uses configfs
|
||||
for its configuration. Obviously, there will be a nice program that
|
||||
sysadmins use to configure FakeNBD, but somehow that program has to tell
|
||||
the driver about it. Here's where configfs comes in.
|
||||
|
||||
When the FakeNBD driver is loaded, it registers itself with configfs.
|
||||
readdir(3) sees this just fine:
|
||||
|
||||
# ls /config
|
||||
fakenbd
|
||||
|
||||
A fakenbd connection can be created with mkdir(2). The name is
|
||||
arbitrary, but likely the tool will make some use of the name. Perhaps
|
||||
it is a uuid or a disk name:
|
||||
|
||||
# mkdir /config/fakenbd/disk1
|
||||
# ls /config/fakenbd/disk1
|
||||
target device rw
|
||||
|
||||
The target attribute contains the IP address of the server FakeNBD will
|
||||
connect to. The device attribute is the device on the server.
|
||||
Predictably, the rw attribute determines whether the connection is
|
||||
read-only or read-write.
|
||||
|
||||
# echo 10.0.0.1 > /config/fakenbd/disk1/target
|
||||
# echo /dev/sda1 > /config/fakenbd/disk1/device
|
||||
# echo 1 > /config/fakenbd/disk1/rw
|
||||
|
||||
That's it. That's all there is. Now the device is configured, via the
|
||||
shell no less.
|
||||
|
||||
[Coding With configfs]
|
||||
|
||||
Every object in configfs is a config_item. A config_item reflects an
|
||||
object in the subsystem. It has attributes that match values on that
|
||||
object. configfs handles the filesystem representation of that object
|
||||
and its attributes, allowing the subsystem to ignore all but the
|
||||
basic show/store interaction.
|
||||
|
||||
Items are created and destroyed inside a config_group. A group is a
|
||||
collection of items that share the same attributes and operations.
|
||||
Items are created by mkdir(2) and removed by rmdir(2), but configfs
|
||||
handles that. The group has a set of operations to perform these tasks.
|
||||
|
||||
A subsystem is the top level of a client module. During initialization,
|
||||
the client module registers the subsystem with configfs, and the subsystem
|
||||
appears as a directory at the top of the configfs filesystem. A
|
||||
subsystem is also a config_group, and can do everything a config_group
|
||||
can.
|
||||
|
||||
[struct config_item]
|
||||
|
||||
struct config_item {
|
||||
char *ci_name;
|
||||
char ci_namebuf[UOBJ_NAME_LEN];
|
||||
struct kref ci_kref;
|
||||
struct list_head ci_entry;
|
||||
struct config_item *ci_parent;
|
||||
struct config_group *ci_group;
|
||||
struct config_item_type *ci_type;
|
||||
struct dentry *ci_dentry;
|
||||
};
|
||||
|
||||
void config_item_init(struct config_item *);
|
||||
void config_item_init_type_name(struct config_item *,
|
||||
const char *name,
|
||||
struct config_item_type *type);
|
||||
struct config_item *config_item_get(struct config_item *);
|
||||
void config_item_put(struct config_item *);
|
||||
|
||||
Generally, struct config_item is embedded in a container structure, a
|
||||
structure that actually represents what the subsystem is doing. The
|
||||
config_item portion of that structure is how the object interacts with
|
||||
configfs.
|
||||
|
||||
Whether statically defined in a source file or created by a parent
|
||||
config_group, a config_item must have one of the _init() functions
|
||||
called on it. This initializes the reference count and sets up the
|
||||
appropriate fields.
|
||||
|
||||
All users of a config_item should have a reference on it via
|
||||
config_item_get(), and drop the reference when they are done via
|
||||
config_item_put().
|
||||
|
||||
By itself, a config_item cannot do much more than appear in configfs.
|
||||
Usually a subsystem wants the item to display and/or store attributes,
|
||||
among other things. For that, it needs a type.
|
||||
|
||||
[struct config_item_type]
|
||||
|
||||
struct configfs_item_operations {
|
||||
void (*release)(struct config_item *);
|
||||
ssize_t (*show_attribute)(struct config_item *,
|
||||
struct configfs_attribute *,
|
||||
char *);
|
||||
ssize_t (*store_attribute)(struct config_item *,
|
||||
struct configfs_attribute *,
|
||||
const char *, size_t);
|
||||
int (*allow_link)(struct config_item *src,
|
||||
struct config_item *target);
|
||||
int (*drop_link)(struct config_item *src,
|
||||
struct config_item *target);
|
||||
};
|
||||
|
||||
struct config_item_type {
|
||||
struct module *ct_owner;
|
||||
struct configfs_item_operations *ct_item_ops;
|
||||
struct configfs_group_operations *ct_group_ops;
|
||||
struct configfs_attribute **ct_attrs;
|
||||
};
|
||||
|
||||
The most basic function of a config_item_type is to define what
|
||||
operations can be performed on a config_item. All items that have been
|
||||
allocated dynamically will need to provide the ct_item_ops->release()
|
||||
method. This method is called when the config_item's reference count
|
||||
reaches zero. Items that wish to display an attribute need to provide
|
||||
the ct_item_ops->show_attribute() method. Similarly, storing a new
|
||||
attribute value uses the store_attribute() method.
|
||||
|
||||
[struct configfs_attribute]
|
||||
|
||||
struct configfs_attribute {
|
||||
char *ca_name;
|
||||
struct module *ca_owner;
|
||||
mode_t ca_mode;
|
||||
};
|
||||
|
||||
When a config_item wants an attribute to appear as a file in the item's
|
||||
configfs directory, it must define a configfs_attribute describing it.
|
||||
It then adds the attribute to the NULL-terminated array
|
||||
config_item_type->ct_attrs. When the item appears in configfs, the
|
||||
attribute file will appear with the configfs_attribute->ca_name
|
||||
filename. configfs_attribute->ca_mode specifies the file permissions.
|
||||
|
||||
If an attribute is readable and the config_item provides a
|
||||
ct_item_ops->show_attribute() method, that method will be called
|
||||
whenever userspace asks for a read(2) on the attribute. The converse
|
||||
will happen for write(2).
|
||||
|
||||
[struct config_group]
|
||||
|
||||
A config_item cannot live in a vacuum. The only way one can be created
|
||||
is via mkdir(2) on a config_group. This will trigger creation of a
|
||||
child item.
|
||||
|
||||
struct config_group {
|
||||
struct config_item cg_item;
|
||||
struct list_head cg_children;
|
||||
struct configfs_subsystem *cg_subsys;
|
||||
struct config_group **default_groups;
|
||||
};
|
||||
|
||||
void config_group_init(struct config_group *group);
|
||||
void config_group_init_type_name(struct config_group *group,
|
||||
const char *name,
|
||||
struct config_item_type *type);
|
||||
|
||||
|
||||
The config_group structure contains a config_item. Properly configuring
|
||||
that item means that a group can behave as an item in its own right.
|
||||
However, it can do more: it can create child items or groups. This is
|
||||
accomplished via the group operations specified on the group's
|
||||
config_item_type.
|
||||
|
||||
struct configfs_group_operations {
|
||||
struct config_item *(*make_item)(struct config_group *group,
|
||||
const char *name);
|
||||
struct config_group *(*make_group)(struct config_group *group,
|
||||
const char *name);
|
||||
int (*commit_item)(struct config_item *item);
|
||||
void (*drop_item)(struct config_group *group,
|
||||
struct config_item *item);
|
||||
};
|
||||
|
||||
A group creates child items by providing the
|
||||
ct_group_ops->make_item() method. If provided, this method is called from mkdir(2) in the group's directory. The subsystem allocates a new
|
||||
config_item (or more likely, its container structure), initializes it,
|
||||
and returns it to configfs. Configfs will then populate the filesystem
|
||||
tree to reflect the new item.
|
||||
|
||||
If the subsystem wants the child to be a group itself, the subsystem
|
||||
provides ct_group_ops->make_group(). Everything else behaves the same,
|
||||
using the group _init() functions on the group.
|
||||
|
||||
Finally, when userspace calls rmdir(2) on the item or group,
|
||||
ct_group_ops->drop_item() is called. As a config_group is also a
|
||||
config_item, there is no need for a separate drop_group() method.
|
||||
The subsystem must config_item_put() the reference that was initialized
|
||||
upon item allocation. If a subsystem has no work to do, it may omit
|
||||
the ct_group_ops->drop_item() method, and configfs will call
|
||||
config_item_put() on the item on behalf of the subsystem.
|
||||
|
||||
IMPORTANT: drop_item() is void, and as such cannot fail. When rmdir(2)
|
||||
is called, configfs WILL remove the item from the filesystem tree
|
||||
(assuming that it has no children to keep it busy). The subsystem is
|
||||
responsible for responding to this. If the subsystem has references to
|
||||
the item in other threads, the memory is safe. It may take some time
|
||||
for the item to actually disappear from the subsystem's usage. But it
|
||||
is gone from configfs.
|
||||
|
||||
A config_group cannot be removed while it still has child items. This
|
||||
is implemented in the configfs rmdir(2) code. ->drop_item() will not be
|
||||
called, as the item has not been dropped. rmdir(2) will fail, as the
|
||||
directory is not empty.
|
||||
|
||||
[struct configfs_subsystem]
|
||||
|
||||
A subsystem must register itself, usually at module_init time. This
|
||||
tells configfs to make the subsystem appear in the file tree.
|
||||
|
||||
struct configfs_subsystem {
|
||||
struct config_group su_group;
|
||||
struct semaphore su_sem;
|
||||
};
|
||||
|
||||
int configfs_register_subsystem(struct configfs_subsystem *subsys);
|
||||
void configfs_unregister_subsystem(struct configfs_subsystem *subsys);
|
||||
|
||||
A subsystem consists of a toplevel config_group and a semaphore.
|
||||
The group is where child config_items are created. For a subsystem,
|
||||
this group is usually defined statically. Before calling
|
||||
configfs_register_subsystem(), the subsystem must have initialized the
|
||||
group via the usual group _init() functions, and it must also have
|
||||
initialized the semaphore.
|
||||
When the register call returns, the subsystem is live, and it
|
||||
will be visible via configfs. At that point, mkdir(2) can be called and
|
||||
the subsystem must be ready for it.
|
||||
|
||||
[An Example]
|
||||
|
||||
The best example of these basic concepts is the simple_children
|
||||
subsystem/group and the simple_child item in configfs_example.c. It
|
||||
shows a trivial object displaying and storing an attribute, and a simple
|
||||
group creating and destroying these children.
|
||||
|
||||
[Hierarchy Navigation and the Subsystem Semaphore]
|
||||
|
||||
There is an extra bonus that configfs provides. The config_groups and
|
||||
config_items are arranged in a hierarchy due to the fact that they
|
||||
appear in a filesystem. A subsystem is NEVER to touch the filesystem
|
||||
parts, but the subsystem might be interested in this hierarchy. For
|
||||
this reason, the hierarchy is mirrored via the config_group->cg_children
|
||||
and config_item->ci_parent structure members.
|
||||
|
||||
A subsystem can navigate the cg_children list and the ci_parent pointer
|
||||
to see the tree created by the subsystem. This can race with configfs'
|
||||
management of the hierarchy, so configfs uses the subsystem semaphore to
|
||||
protect modifications. Whenever a subsystem wants to navigate the
|
||||
hierarchy, it must do so under the protection of the subsystem
|
||||
semaphore.
|
||||
|
||||
A subsystem will be prevented from acquiring the semaphore while a newly
|
||||
allocated item has not been linked into this hierarchy. Similarly, it
|
||||
will not be able to acquire the semaphore while a dropping item has not
|
||||
yet been unlinked. This means that an item's ci_parent pointer will
|
||||
never be NULL while the item is in configfs, and that an item will only
|
||||
be in its parent's cg_children list for the same duration. This allows
|
||||
a subsystem to trust ci_parent and cg_children while they hold the
|
||||
semaphore.
|
||||
|
||||
[Item Aggregation Via symlink(2)]
|
||||
|
||||
configfs provides a simple group via the group->item parent/child
|
||||
relationship. Often, however, a larger environment requires aggregation
|
||||
outside of the parent/child connection. This is implemented via
|
||||
symlink(2).
|
||||
|
||||
A config_item may provide the ct_item_ops->allow_link() and
|
||||
ct_item_ops->drop_link() methods. If the ->allow_link() method exists,
|
||||
symlink(2) may be called with the config_item as the source of the link.
|
||||
These links are only allowed between configfs config_items. Any
|
||||
symlink(2) attempt outside the configfs filesystem will be denied.
|
||||
|
||||
When symlink(2) is called, the source config_item's ->allow_link()
|
||||
method is called with itself and a target item. If the source item
|
||||
allows linking to target item, it returns 0. A source item may wish to
|
||||
reject a link if it only wants links to a certain type of object (say,
|
||||
in its own subsystem).
|
||||
|
||||
When unlink(2) is called on the symbolic link, the source item is
|
||||
notified via the ->drop_link() method. Like the ->drop_item() method,
|
||||
this is a void function and cannot return failure. The subsystem is
|
||||
responsible for responding to the change.
|
||||
|
||||
A config_item cannot be removed while it links to any other item, nor
|
||||
can it be removed while an item links to it. Dangling symlinks are not
|
||||
allowed in configfs.
|
||||
|
||||
[Automatically Created Subgroups]
|
||||
|
||||
A new config_group may want to have two types of child config_items.
|
||||
While this could be codified by magic names in ->make_item(), it is much
|
||||
more explicit to have a method whereby userspace sees this divergence.
|
||||
|
||||
Rather than have a group where some items behave differently than
|
||||
others, configfs provides a method whereby one or many subgroups are
|
||||
automatically created inside the parent at its creation. Thus,
|
||||
mkdir("parent") results in "parent", "parent/subgroup1", up through
|
||||
"parent/subgroupN". Items of type 1 can now be created in
|
||||
"parent/subgroup1", and items of type N can be created in
|
||||
"parent/subgroupN".
|
||||
|
||||
These automatic subgroups, or default groups, do not preclude other
|
||||
children of the parent group. If ct_group_ops->make_group() exists,
|
||||
other child groups can be created on the parent group directly.
|
||||
|
||||
A configfs subsystem specifies default groups by filling in the
|
||||
NULL-terminated array default_groups on the config_group structure.
|
||||
Each group in that array is populated in the configfs tree at the same
|
||||
time as the parent group. Similarly, they are removed at the same time
|
||||
as the parent. No extra notification is provided. When a ->drop_item()
|
||||
method call notifies the subsystem the parent group is going away, it
|
||||
also means every default group child associated with that parent group.
|
||||
|
||||
As a consequence of this, default_groups cannot be removed directly via
|
||||
rmdir(2). They also are not considered when rmdir(2) on the parent
|
||||
group is checking for children.
|
||||
|
||||
[Committable Items]
|
||||
|
||||
NOTE: Committable items are currently unimplemented.
|
||||
|
||||
Some config_items cannot have a valid initial state. That is, no
|
||||
default values can be specified for the item's attributes such that the
|
||||
item can do its work. Userspace must configure one or more attributes,
|
||||
after which the subsystem can start whatever entity this item
|
||||
represents.
|
||||
|
||||
Consider the FakeNBD device from above. Without a target address *and*
|
||||
a target device, the subsystem has no idea what block device to import.
|
||||
The simple example assumes that the subsystem merely waits until all the
|
||||
appropriate attributes are configured, and then connects. This will,
|
||||
indeed, work, but now every attribute store must check if the attributes
|
||||
are initialized. Every attribute store must fire off the connection if
|
||||
that condition is met.
|
||||
|
||||
Far better would be an explicit action notifying the subsystem that the
|
||||
config_item is ready to go. More importantly, an explicit action allows
|
||||
the subsystem to provide feedback as to whether the attributes are
|
||||
initialized in a way that makes sense. configfs provides this as
|
||||
committable items.
|
||||
|
||||
configfs still uses only normal filesystem operations. An item is
|
||||
committed via rename(2). The item is moved from a directory where it
|
||||
can be modified to a directory where it cannot.
|
||||
|
||||
Any group that provides the ct_group_ops->commit_item() method has
|
||||
committable items. When this group appears in configfs, mkdir(2) will
|
||||
not work directly in the group. Instead, the group will have two
|
||||
subdirectories: "live" and "pending". The "live" directory does not
|
||||
support mkdir(2) or rmdir(2) either. It only allows rename(2). The
|
||||
"pending" directory does allow mkdir(2) and rmdir(2). An item is
|
||||
created in the "pending" directory. Its attributes can be modified at
|
||||
will. Userspace commits the item by renaming it into the "live"
|
||||
directory. At this point, the subsystem receives the ->commit_item()
|
||||
callback. If all required attributes are filled to satisfaction, the
|
||||
method returns zero and the item is moved to the "live" directory.
|
||||
|
||||
As rmdir(2) does not work in the "live" directory, an item must be
|
||||
shut down, or "uncommitted". Again, this is done via rename(2), this
|
||||
time from the "live" directory back to the "pending" one. The subsystem
|
||||
is notified by the ct_group_ops->uncommit_object() method.
|
||||
|
||||
|
474
Documentation/filesystems/configfs/configfs_example.c
Normal file
474
Documentation/filesystems/configfs/configfs_example.c
Normal file
@ -0,0 +1,474 @@
|
||||
/*
|
||||
* vim: noexpandtab ts=8 sts=0 sw=8:
|
||||
*
|
||||
* configfs_example.c - This file is a demonstration module containing
|
||||
* a number of configfs subsystems.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Based on sysfs:
|
||||
* sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
|
||||
*
|
||||
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <linux/configfs.h>
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* 01-childless
|
||||
*
|
||||
* This first example is a childless subsystem. It cannot create
|
||||
* any config_items. It just has attributes.
|
||||
*
|
||||
* Note that we are enclosing the configfs_subsystem inside a container.
|
||||
* This is not necessary if a subsystem has no attributes directly
|
||||
* on the subsystem. See the next example, 02-simple-children, for
|
||||
* such a subsystem.
|
||||
*/
|
||||
|
||||
struct childless {
|
||||
struct configfs_subsystem subsys;
|
||||
int showme;
|
||||
int storeme;
|
||||
};
|
||||
|
||||
struct childless_attribute {
|
||||
struct configfs_attribute attr;
|
||||
ssize_t (*show)(struct childless *, char *);
|
||||
ssize_t (*store)(struct childless *, const char *, size_t);
|
||||
};
|
||||
|
||||
static inline struct childless *to_childless(struct config_item *item)
|
||||
{
|
||||
return item ? container_of(to_configfs_subsystem(to_config_group(item)), struct childless, subsys) : NULL;
|
||||
}
|
||||
|
||||
static ssize_t childless_showme_read(struct childless *childless,
|
||||
char *page)
|
||||
{
|
||||
ssize_t pos;
|
||||
|
||||
pos = sprintf(page, "%d\n", childless->showme);
|
||||
childless->showme++;
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static ssize_t childless_storeme_read(struct childless *childless,
|
||||
char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", childless->storeme);
|
||||
}
|
||||
|
||||
static ssize_t childless_storeme_write(struct childless *childless,
|
||||
const char *page,
|
||||
size_t count)
|
||||
{
|
||||
unsigned long tmp;
|
||||
char *p = (char *) page;
|
||||
|
||||
tmp = simple_strtoul(p, &p, 10);
|
||||
if (!p || (*p && (*p != '\n')))
|
||||
return -EINVAL;
|
||||
|
||||
if (tmp > INT_MAX)
|
||||
return -ERANGE;
|
||||
|
||||
childless->storeme = tmp;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static ssize_t childless_description_read(struct childless *childless,
|
||||
char *page)
|
||||
{
|
||||
return sprintf(page,
|
||||
"[01-childless]\n"
|
||||
"\n"
|
||||
"The childless subsystem is the simplest possible subsystem in\n"
|
||||
"configfs. It does not support the creation of child config_items.\n"
|
||||
"It only has a few attributes. In fact, it isn't much different\n"
|
||||
"than a directory in /proc.\n");
|
||||
}
|
||||
|
||||
static struct childless_attribute childless_attr_showme = {
|
||||
.attr = { .ca_owner = THIS_MODULE, .ca_name = "showme", .ca_mode = S_IRUGO },
|
||||
.show = childless_showme_read,
|
||||
};
|
||||
static struct childless_attribute childless_attr_storeme = {
|
||||
.attr = { .ca_owner = THIS_MODULE, .ca_name = "storeme", .ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = childless_storeme_read,
|
||||
.store = childless_storeme_write,
|
||||
};
|
||||
static struct childless_attribute childless_attr_description = {
|
||||
.attr = { .ca_owner = THIS_MODULE, .ca_name = "description", .ca_mode = S_IRUGO },
|
||||
.show = childless_description_read,
|
||||
};
|
||||
|
||||
static struct configfs_attribute *childless_attrs[] = {
|
||||
&childless_attr_showme.attr,
|
||||
&childless_attr_storeme.attr,
|
||||
&childless_attr_description.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static ssize_t childless_attr_show(struct config_item *item,
|
||||
struct configfs_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
struct childless *childless = to_childless(item);
|
||||
struct childless_attribute *childless_attr =
|
||||
container_of(attr, struct childless_attribute, attr);
|
||||
ssize_t ret = 0;
|
||||
|
||||
if (childless_attr->show)
|
||||
ret = childless_attr->show(childless, page);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t childless_attr_store(struct config_item *item,
|
||||
struct configfs_attribute *attr,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
struct childless *childless = to_childless(item);
|
||||
struct childless_attribute *childless_attr =
|
||||
container_of(attr, struct childless_attribute, attr);
|
||||
ssize_t ret = -EINVAL;
|
||||
|
||||
if (childless_attr->store)
|
||||
ret = childless_attr->store(childless, page, count);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct configfs_item_operations childless_item_ops = {
|
||||
.show_attribute = childless_attr_show,
|
||||
.store_attribute = childless_attr_store,
|
||||
};
|
||||
|
||||
static struct config_item_type childless_type = {
|
||||
.ct_item_ops = &childless_item_ops,
|
||||
.ct_attrs = childless_attrs,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct childless childless_subsys = {
|
||||
.subsys = {
|
||||
.su_group = {
|
||||
.cg_item = {
|
||||
.ci_namebuf = "01-childless",
|
||||
.ci_type = &childless_type,
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* 02-simple-children
|
||||
*
|
||||
* This example merely has a simple one-attribute child. Note that
|
||||
* there is no extra attribute structure, as the child's attribute is
|
||||
* known from the get-go. Also, there is no container for the
|
||||
* subsystem, as it has no attributes of its own.
|
||||
*/
|
||||
|
||||
struct simple_child {
|
||||
struct config_item item;
|
||||
int storeme;
|
||||
};
|
||||
|
||||
static inline struct simple_child *to_simple_child(struct config_item *item)
|
||||
{
|
||||
return item ? container_of(item, struct simple_child, item) : NULL;
|
||||
}
|
||||
|
||||
static struct configfs_attribute simple_child_attr_storeme = {
|
||||
.ca_owner = THIS_MODULE,
|
||||
.ca_name = "storeme",
|
||||
.ca_mode = S_IRUGO | S_IWUSR,
|
||||
};
|
||||
|
||||
static struct configfs_attribute *simple_child_attrs[] = {
|
||||
&simple_child_attr_storeme,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static ssize_t simple_child_attr_show(struct config_item *item,
|
||||
struct configfs_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
ssize_t count;
|
||||
struct simple_child *simple_child = to_simple_child(item);
|
||||
|
||||
count = sprintf(page, "%d\n", simple_child->storeme);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static ssize_t simple_child_attr_store(struct config_item *item,
|
||||
struct configfs_attribute *attr,
|
||||
const char *page, size_t count)
|
||||
{
|
||||
struct simple_child *simple_child = to_simple_child(item);
|
||||
unsigned long tmp;
|
||||
char *p = (char *) page;
|
||||
|
||||
tmp = simple_strtoul(p, &p, 10);
|
||||
if (!p || (*p && (*p != '\n')))
|
||||
return -EINVAL;
|
||||
|
||||
if (tmp > INT_MAX)
|
||||
return -ERANGE;
|
||||
|
||||
simple_child->storeme = tmp;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
/*
 * ->release() for a simple_child: free the structure that embeds
 * the given config_item.
 */
static void simple_child_release(struct config_item *item)
{
	struct simple_child *child = to_simple_child(item);

	kfree(child);
}
|
||||
|
||||
static struct configfs_item_operations simple_child_item_ops = {
|
||||
.release = simple_child_release,
|
||||
.show_attribute = simple_child_attr_show,
|
||||
.store_attribute = simple_child_attr_store,
|
||||
};
|
||||
|
||||
static struct config_item_type simple_child_type = {
|
||||
.ct_item_ops = &simple_child_item_ops,
|
||||
.ct_attrs = simple_child_attrs,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
|
||||
static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
|
||||
{
|
||||
struct simple_child *simple_child;
|
||||
|
||||
simple_child = kmalloc(sizeof(struct simple_child), GFP_KERNEL);
|
||||
if (!simple_child)
|
||||
return NULL;
|
||||
|
||||
memset(simple_child, 0, sizeof(struct simple_child));
|
||||
|
||||
config_item_init_type_name(&simple_child->item, name,
|
||||
&simple_child_type);
|
||||
|
||||
simple_child->storeme = 0;
|
||||
|
||||
return &simple_child->item;
|
||||
}
|
||||
|
||||
static struct configfs_attribute simple_children_attr_description = {
|
||||
.ca_owner = THIS_MODULE,
|
||||
.ca_name = "description",
|
||||
.ca_mode = S_IRUGO,
|
||||
};
|
||||
|
||||
static struct configfs_attribute *simple_children_attrs[] = {
|
||||
&simple_children_attr_description,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
 * Show handler for the simple_children group: formats the fixed
 * description text into @page and returns the length reported by
 * sprintf().
 */
static ssize_t simple_children_attr_show(struct config_item *item,
					 struct configfs_attribute *attr,
					 char *page)
{
	ssize_t len;

	len = sprintf(page,
"[02-simple-children]\n"
"\n"
"This subsystem allows the creation of child config_items. These\n"
"items have only one attribute that is readable and writeable.\n");

	return len;
}
|
||||
|
||||
static struct configfs_item_operations simple_children_item_ops = {
|
||||
.show_attribute = simple_children_attr_show,
|
||||
};
|
||||
|
||||
/*
|
||||
* Note that, since no extra work is required on ->drop_item(),
|
||||
* no ->drop_item() is provided.
|
||||
*/
|
||||
static struct configfs_group_operations simple_children_group_ops = {
|
||||
.make_item = simple_children_make_item,
|
||||
};
|
||||
|
||||
static struct config_item_type simple_children_type = {
|
||||
.ct_item_ops = &simple_children_item_ops,
|
||||
.ct_group_ops = &simple_children_group_ops,
|
||||
.ct_attrs = simple_children_attrs,
|
||||
};
|
||||
|
||||
static struct configfs_subsystem simple_children_subsys = {
|
||||
.su_group = {
|
||||
.cg_item = {
|
||||
.ci_namebuf = "02-simple-children",
|
||||
.ci_type = &simple_children_type,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* 03-group-children
|
||||
*
|
||||
* This example reuses the simple_children group from above. However,
|
||||
* the simple_children group is not the subsystem itself, it is a
|
||||
* child of the subsystem. Creation of a group in the subsystem creates
|
||||
* a new simple_children group. That group can then have simple_child
|
||||
* children of its own.
|
||||
*/
|
||||
|
||||
struct simple_children {
|
||||
struct config_group group;
|
||||
};
|
||||
|
||||
static struct config_group *group_children_make_group(struct config_group *group, const char *name)
|
||||
{
|
||||
struct simple_children *simple_children;
|
||||
|
||||
simple_children = kmalloc(sizeof(struct simple_children),
|
||||
GFP_KERNEL);
|
||||
if (!simple_children)
|
||||
return NULL;
|
||||
|
||||
memset(simple_children, 0, sizeof(struct simple_children));
|
||||
|
||||
config_group_init_type_name(&simple_children->group, name,
|
||||
&simple_children_type);
|
||||
|
||||
return &simple_children->group;
|
||||
}
|
||||
|
||||
static struct configfs_attribute group_children_attr_description = {
|
||||
.ca_owner = THIS_MODULE,
|
||||
.ca_name = "description",
|
||||
.ca_mode = S_IRUGO,
|
||||
};
|
||||
|
||||
static struct configfs_attribute *group_children_attrs[] = {
|
||||
&group_children_attr_description,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
 * Show handler for the group_children subsystem: formats the fixed
 * description text into @page and returns the length reported by
 * sprintf().
 */
static ssize_t group_children_attr_show(struct config_item *item,
					struct configfs_attribute *attr,
					char *page)
{
	ssize_t len;

	len = sprintf(page,
"[03-group-children]\n"
"\n"
"This subsystem allows the creation of child config_groups. These\n"
"groups are like the subsystem simple-children.\n");

	return len;
}
|
||||
|
||||
static struct configfs_item_operations group_children_item_ops = {
|
||||
.show_attribute = group_children_attr_show,
|
||||
};
|
||||
|
||||
/*
|
||||
* Note that, since no extra work is required on ->drop_item(),
|
||||
* no ->drop_item() is provided.
|
||||
*/
|
||||
static struct configfs_group_operations group_children_group_ops = {
|
||||
.make_group = group_children_make_group,
|
||||
};
|
||||
|
||||
static struct config_item_type group_children_type = {
|
||||
.ct_item_ops = &group_children_item_ops,
|
||||
.ct_group_ops = &group_children_group_ops,
|
||||
.ct_attrs = group_children_attrs,
|
||||
};
|
||||
|
||||
static struct configfs_subsystem group_children_subsys = {
|
||||
.su_group = {
|
||||
.cg_item = {
|
||||
.ci_namebuf = "03-group-children",
|
||||
.ci_type = &group_children_type,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
/* ----------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* We're now done with our subsystem definitions.
|
||||
* For convenience in this module, here's a list of them all. It
|
||||
* allows the init function to easily register them. Most modules
|
||||
* will only have one subsystem, and will only call register_subsystem
|
||||
* on it directly.
|
||||
*/
|
||||
static struct configfs_subsystem *example_subsys[] = {
|
||||
&childless_subsys.subsys,
|
||||
&simple_children_subsys,
|
||||
&group_children_subsys,
|
||||
NULL,
|
||||
};
|
||||
|
||||
/*
 * Module init: initialise and register every subsystem listed in
 * example_subsys[].
 *
 * Returns 0 once all of them are registered.  If a registration fails,
 * the error is logged, the subsystems registered before it are
 * unregistered again, and the error code is returned.
 */
static int __init configfs_example_init(void)
{
	int ret;
	int i;
	struct configfs_subsystem *subsys;

	for (i = 0; example_subsys[i]; i++) {
		subsys = example_subsys[i];

		config_group_init(&subsys->su_group);
		init_MUTEX(&subsys->su_sem);
		ret = configfs_register_subsystem(subsys);
		if (ret) {
			printk(KERN_ERR "Error %d while registering subsystem %s\n",
			       ret,
			       subsys->su_group.cg_item.ci_namebuf);
			goto out_unregister;
		}
	}

	return 0;

out_unregister:
	/*
	 * example_subsys[i] is the one that failed to register, so the
	 * unwind must start at the entry before it.
	 */
	for (i--; i >= 0; i--) {
		configfs_unregister_subsystem(example_subsys[i]);
	}

	return ret;
}
|
||||
|
||||
/*
 * Module exit: unregister every subsystem in the NULL-terminated
 * example_subsys[] list, in list order.
 */
static void __exit configfs_example_exit(void)
{
	struct configfs_subsystem **entry;

	for (entry = example_subsys; *entry; entry++)
		configfs_unregister_subsystem(*entry);
}
|
||||
|
||||
module_init(configfs_example_init);
|
||||
module_exit(configfs_example_exit);
|
||||
MODULE_LICENSE("GPL");
|
130
Documentation/filesystems/dlmfs.txt
Normal file
130
Documentation/filesystems/dlmfs.txt
Normal file
@ -0,0 +1,130 @@
|
||||
dlmfs
|
||||
==================
|
||||
A minimal DLM userspace interface implemented via a virtual file
|
||||
system.
|
||||
|
||||
dlmfs is built with OCFS2 as it requires most of its infrastructure.
|
||||
|
||||
Project web page: http://oss.oracle.com/projects/ocfs2
|
||||
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
|
||||
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
|
||||
|
||||
All code copyright 2005 Oracle except when otherwise noted.
|
||||
|
||||
CREDITS
|
||||
=======
|
||||
|
||||
Some code taken from ramfs which is Copyright (C) 2000 Linus Torvalds
|
||||
and Transmeta Corp.
|
||||
|
||||
Mark Fasheh <mark.fasheh@oracle.com>
|
||||
|
||||
Caveats
|
||||
=======
|
||||
- Right now it only works with the OCFS2 DLM, though support for other
|
||||
DLM implementations should not be a major issue.
|
||||
|
||||
Mount options
|
||||
=============
|
||||
None
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
If you're just interested in OCFS2, then please see ocfs2.txt. The
|
||||
rest of this document will be geared towards those who want to use
|
||||
dlmfs for easy to setup and easy to use clustered locking in
|
||||
userspace.
|
||||
|
||||
Setup
|
||||
=====
|
||||
|
||||
dlmfs requires that the OCFS2 cluster infrastructure be in
|
||||
place. Please download ocfs2-tools from the above url and configure a
|
||||
cluster.
|
||||
|
||||
You'll want to start heartbeating on a volume which all the nodes in
|
||||
your lockspace can access. The easiest way to do this is via
|
||||
ocfs2_hb_ctl (distributed with ocfs2-tools). Right now it requires
|
||||
that an OCFS2 file system be in place so that it can automatically
|
||||
find its heartbeat area, though it will eventually support heartbeat
|
||||
against raw disks.
|
||||
|
||||
Please see the ocfs2_hb_ctl and mkfs.ocfs2 manual pages distributed
|
||||
with ocfs2-tools.
|
||||
|
||||
Once you're heartbeating, DLM lock 'domains' can be easily created /
|
||||
destroyed and locks within them accessed.
|
||||
|
||||
Locking
|
||||
=======
|
||||
|
||||
Users may access dlmfs via standard file system calls, or they can use
|
||||
'libo2dlm' (distributed with ocfs2-tools) which abstracts the file
|
||||
system calls and presents a more traditional locking api.
|
||||
|
||||
dlmfs handles lock caching automatically for the user, so a lock
|
||||
request for an already acquired lock will not generate another DLM
|
||||
call. Userspace programs are assumed to handle their own local
|
||||
locking.
|
||||
|
||||
Two levels of locks are supported - Shared Read, and Exclusive.
|
||||
Also supported is a Trylock operation.
|
||||
|
||||
For information on the libo2dlm interface, please see o2dlm.h,
|
||||
distributed with ocfs2-tools.
|
||||
|
||||
Lock value blocks can be read and written to a resource via read(2)
|
||||
and write(2) against the fd obtained via your open(2) call. The
|
||||
maximum currently supported LVB length is 64 bytes (though that is an
|
||||
OCFS2 DLM limitation). Through this mechanism, users of dlmfs can share
|
||||
small amounts of data amongst their nodes.
|
||||
|
||||
mkdir(2) signals dlmfs to join a domain (which will have the same name
|
||||
as the resulting directory)
|
||||
|
||||
rmdir(2) signals dlmfs to leave the domain
|
||||
|
||||
Locks for a given domain are represented by regular inodes inside the
|
||||
domain directory. Locking against them is done via the open(2) system
|
||||
call.
|
||||
|
||||
The open(2) call will not return until your lock has been granted or
|
||||
an error has occurred, unless it has been instructed to do a trylock
|
||||
operation. If the lock succeeds, you'll get an fd.
|
||||
|
||||
open(2) with O_CREAT to ensure the resource inode is created - dlmfs does
|
||||
not automatically create inodes for existing lock resources.
|
||||
|
||||
Open Flag Lock Request Type
|
||||
--------- -----------------
|
||||
O_RDONLY Shared Read
|
||||
O_RDWR Exclusive
|
||||
|
||||
Open Flag Resulting Locking Behavior
|
||||
--------- --------------------------
|
||||
O_NONBLOCK Trylock operation
|
||||
|
||||
You must provide exactly one of O_RDONLY or O_RDWR.
|
||||
|
||||
If O_NONBLOCK is also provided and the trylock operation was valid but
|
||||
could not lock the resource then open(2) will return ETXTBSY.
|
||||
|
||||
close(2) drops the lock associated with your fd.
|
||||
|
||||
Modes passed to mkdir(2) or open(2) are adhered to locally. Chown is
|
||||
supported locally as well. This means you can use them to restrict
|
||||
access to the resources via dlmfs on your local node only.
|
||||
|
||||
The resource LVB may be read from the fd in either Shared Read or
|
||||
Exclusive modes via the read(2) system call. It can be written via
|
||||
write(2) only when open in Exclusive mode.
|
||||
|
||||
Once written, an LVB will be visible to other nodes who obtain Read
|
||||
Only or higher level locks on the resource.
|
||||
|
||||
See Also
|
||||
========
|
||||
http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
|
||||
|
||||
For more information on the VMS distributed locking API.
|
55
Documentation/filesystems/ocfs2.txt
Normal file
55
Documentation/filesystems/ocfs2.txt
Normal file
@ -0,0 +1,55 @@
|
||||
OCFS2 filesystem
|
||||
==================
|
||||
OCFS2 is a general purpose extent based shared disk cluster file
|
||||
system with many similarities to ext3. It supports 64 bit inode
|
||||
numbers, and has automatically extending metadata groups which may
|
||||
also make it attractive for non-clustered use.
|
||||
|
||||
You'll want to install the ocfs2-tools package in order to at least
|
||||
get "mount.ocfs2" and "ocfs2_hb_ctl".
|
||||
|
||||
Project web page: http://oss.oracle.com/projects/ocfs2
|
||||
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
|
||||
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
|
||||
|
||||
All code copyright 2005 Oracle except when otherwise noted.
|
||||
|
||||
CREDITS:
|
||||
Lots of code taken from ext3 and other projects.
|
||||
|
||||
Authors in alphabetical order:
|
||||
Joel Becker <joel.becker@oracle.com>
|
||||
Zach Brown <zach.brown@oracle.com>
|
||||
Mark Fasheh <mark.fasheh@oracle.com>
|
||||
Kurt Hackel <kurt.hackel@oracle.com>
|
||||
Sunil Mushran <sunil.mushran@oracle.com>
|
||||
Manish Singh <manish.singh@oracle.com>
|
||||
|
||||
Caveats
|
||||
=======
|
||||
Features which OCFS2 does not support yet:
|
||||
- sparse files
|
||||
- extended attributes
|
||||
- shared writeable mmap
|
||||
- loopback is supported, but data written will not
|
||||
be cluster coherent.
|
||||
- quotas
|
||||
- cluster aware flock
|
||||
- Directory change notification (F_NOTIFY)
|
||||
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
|
||||
- POSIX ACLs
|
||||
- readpages / writepages (not user visible)
|
||||
|
||||
Mount options
|
||||
=============
|
||||
|
||||
OCFS2 supports the following mount options:
|
||||
(*) == default
|
||||
|
||||
barrier=1 This enables/disables barriers. barrier=0 disables it,
|
||||
barrier=1 enables it.
|
||||
errors=remount-ro(*) Remount the filesystem read-only on an error.
|
||||
errors=panic Panic and halt the machine if an error occurs.
|
||||
intr (*) Allow signals to interrupt cluster operations.
|
||||
nointr Do not allow signals to interrupt cluster
|
||||
operations.
|
14
MAINTAINERS
14
MAINTAINERS
@ -554,6 +554,11 @@ W: http://us1.samba.org/samba/Linux_CIFS_client.html
|
||||
T: git kernel.org:/pub/scm/linux/kernel/git/sfrench/cifs-2.6.git
|
||||
S: Supported
|
||||
|
||||
CONFIGFS
|
||||
P: Joel Becker
|
||||
M: Joel Becker <joel.becker@oracle.com>
|
||||
S: Supported
|
||||
|
||||
CIRRUS LOGIC GENERIC FBDEV DRIVER
|
||||
P: Jeff Garzik
|
||||
M: jgarzik@pobox.com
|
||||
@ -1898,6 +1903,15 @@ M: ajoshi@shell.unixbox.com
|
||||
L: linux-nvidia@lists.surfsouth.com
|
||||
S: Maintained
|
||||
|
||||
ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
|
||||
P: Mark Fasheh
|
||||
M: mark.fasheh@oracle.com
|
||||
P: Kurt Hackel
|
||||
M: kurt.hackel@oracle.com
|
||||
L: ocfs2-devel@oss.oracle.com
|
||||
W: http://oss.oracle.com/projects/ocfs2/
|
||||
S: Supported
|
||||
|
||||
OLYMPIC NETWORK DRIVER
|
||||
P: Peter De Shrijver
|
||||
M: p2@ace.ulyssis.student.kuleuven.ac.be
|
||||
|
@ -213,7 +213,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
|
||||
struct address_space_operations *aops = mapping->a_ops;
|
||||
pgoff_t index;
|
||||
unsigned offset, bv_offs;
|
||||
int len, ret = 0;
|
||||
int len, ret;
|
||||
|
||||
down(&mapping->host->i_sem);
|
||||
index = pos >> PAGE_CACHE_SHIFT;
|
||||
@ -232,9 +232,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
|
||||
page = grab_cache_page(mapping, index);
|
||||
if (unlikely(!page))
|
||||
goto fail;
|
||||
if (unlikely(aops->prepare_write(file, page, offset,
|
||||
offset + size)))
|
||||
ret = aops->prepare_write(file, page, offset,
|
||||
offset + size);
|
||||
if (unlikely(ret)) {
|
||||
if (ret == AOP_TRUNCATED_PAGE) {
|
||||
page_cache_release(page);
|
||||
continue;
|
||||
}
|
||||
goto unlock;
|
||||
}
|
||||
transfer_result = lo_do_transfer(lo, WRITE, page, offset,
|
||||
bvec->bv_page, bv_offs, size, IV);
|
||||
if (unlikely(transfer_result)) {
|
||||
@ -251,9 +257,15 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
|
||||
kunmap_atomic(kaddr, KM_USER0);
|
||||
}
|
||||
flush_dcache_page(page);
|
||||
if (unlikely(aops->commit_write(file, page, offset,
|
||||
offset + size)))
|
||||
ret = aops->commit_write(file, page, offset,
|
||||
offset + size);
|
||||
if (unlikely(ret)) {
|
||||
if (ret == AOP_TRUNCATED_PAGE) {
|
||||
page_cache_release(page);
|
||||
continue;
|
||||
}
|
||||
goto unlock;
|
||||
}
|
||||
if (unlikely(transfer_result))
|
||||
goto unlock;
|
||||
bv_offs += size;
|
||||
@ -264,6 +276,7 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
|
||||
unlock_page(page);
|
||||
page_cache_release(page);
|
||||
}
|
||||
ret = 0;
|
||||
out:
|
||||
up(&mapping->host->i_sem);
|
||||
return ret;
|
||||
|
@ -154,7 +154,7 @@ static int ramdisk_commit_write(struct file *file, struct page *page,
|
||||
|
||||
/*
|
||||
* ->writepage to the blockdev's mapping has to redirty the page so that the
|
||||
* VM doesn't go and steal it. We return WRITEPAGE_ACTIVATE so that the VM
|
||||
* VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM
|
||||
* won't try to (pointlessly) write the page again for a while.
|
||||
*
|
||||
* Really, these pages should not be on the LRU at all.
|
||||
@ -165,7 +165,7 @@ static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
|
||||
make_page_uptodate(page);
|
||||
SetPageDirty(page);
|
||||
if (wbc->for_reclaim)
|
||||
return WRITEPAGE_ACTIVATE;
|
||||
return AOP_WRITEPAGE_ACTIVATE;
|
||||
unlock_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
66
fs/Kconfig
66
fs/Kconfig
@ -70,6 +70,7 @@ config FS_XIP
|
||||
|
||||
config EXT3_FS
|
||||
tristate "Ext3 journalling file system support"
|
||||
select JBD
|
||||
help
|
||||
This is the journaling version of the Second extended file system
|
||||
(often called ext3), the de facto standard Linux file system
|
||||
@ -138,23 +139,20 @@ config EXT3_FS_SECURITY
|
||||
extended attributes for file security labels, say N.
|
||||
|
||||
config JBD
|
||||
# CONFIG_JBD could be its own option (even modular), but until there are
|
||||
# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
|
||||
# dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
|
||||
tristate
|
||||
default EXT3_FS
|
||||
help
|
||||
This is a generic journaling layer for block devices. It is
|
||||
currently used by the ext3 file system, but it could also be used to
|
||||
add journal support to other file systems or block devices such as
|
||||
RAID or LVM.
|
||||
currently used by the ext3 and OCFS2 file systems, but it could
|
||||
also be used to add journal support to other file systems or block
|
||||
devices such as RAID or LVM.
|
||||
|
||||
If you are using the ext3 file system, you need to say Y here. If
|
||||
you are not using ext3 then you will probably want to say N.
|
||||
If you are using the ext3 or OCFS2 file systems, you need to
|
||||
say Y here. If you are not using ext3 or OCFS2 then you will probably
|
||||
want to say N.
|
||||
|
||||
To compile this device as a module, choose M here: the module will be
|
||||
called jbd. If you are compiling ext3 into the kernel, you cannot
|
||||
compile this code as a module.
|
||||
called jbd. If you are compiling ext3 or OCFS2 into the kernel,
|
||||
you cannot compile this code as a module.
|
||||
|
||||
config JBD_DEBUG
|
||||
bool "JBD (ext3) debugging support"
|
||||
@ -326,6 +324,38 @@ config FS_POSIX_ACL
|
||||
|
||||
source "fs/xfs/Kconfig"
|
||||
|
||||
config OCFS2_FS
|
||||
tristate "OCFS2 file system support (EXPERIMENTAL)"
|
||||
depends on NET && EXPERIMENTAL
|
||||
select CONFIGFS_FS
|
||||
select JBD
|
||||
select CRC32
|
||||
select INET
|
||||
help
|
||||
OCFS2 is a general purpose extent based shared disk cluster file
|
||||
system with many similarities to ext3. It supports 64 bit inode
|
||||
numbers, and has automatically extending metadata groups which may
|
||||
also make it attractive for non-clustered use.
|
||||
|
||||
You'll want to install the ocfs2-tools package in order to at least
|
||||
get "mount.ocfs2".
|
||||
|
||||
Project web page: http://oss.oracle.com/projects/ocfs2
|
||||
Tools web page: http://oss.oracle.com/projects/ocfs2-tools
|
||||
OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
|
||||
|
||||
Note: Features which OCFS2 does not support yet:
|
||||
- extended attributes
|
||||
- shared writeable mmap
|
||||
- loopback is supported, but data written will not
|
||||
be cluster coherent.
|
||||
- quotas
|
||||
- cluster aware flock
|
||||
- Directory change notification (F_NOTIFY)
|
||||
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
|
||||
- POSIX ACLs
|
||||
- readpages / writepages (not user visible)
|
||||
|
||||
config MINIX_FS
|
||||
tristate "Minix fs support"
|
||||
help
|
||||
@ -841,6 +871,20 @@ config RELAYFS_FS
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config CONFIGFS_FS
|
||||
tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
|
||||
depends on EXPERIMENTAL
|
||||
help
|
||||
configfs is a ram-based filesystem that provides the converse
|
||||
of sysfs's functionality. Where sysfs is a filesystem-based
|
||||
view of kernel objects, configfs is a filesystem-based manager
|
||||
of kernel objects, or config_items.
|
||||
|
||||
Both sysfs and configfs can and should exist together on the
|
||||
same system. One is not a replacement for the other.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
endmenu
|
||||
|
||||
menu "Miscellaneous filesystems"
|
||||
|
@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS) += befs/
|
||||
obj-$(CONFIG_HOSTFS) += hostfs/
|
||||
obj-$(CONFIG_HPPFS) += hppfs/
|
||||
obj-$(CONFIG_DEBUG_FS) += debugfs/
|
||||
obj-$(CONFIG_CONFIGFS_FS) += configfs/
|
||||
obj-$(CONFIG_OCFS2_FS) += ocfs2/
|
||||
|
7
fs/configfs/Makefile
Normal file
7
fs/configfs/Makefile
Normal file
@ -0,0 +1,7 @@
|
||||
#
|
||||
# Makefile for the configfs virtual filesystem
|
||||
#
|
||||
|
||||
obj-$(CONFIG_CONFIGFS_FS) += configfs.o
|
||||
|
||||
configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o
|
142
fs/configfs/configfs_internal.h
Normal file
142
fs/configfs/configfs_internal.h
Normal file
@ -0,0 +1,142 @@
|
||||
/* -*- mode: c; c-basic-offset:8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* configfs_internal.h - Internal stuff for configfs
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Based on sysfs:
|
||||
* sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
|
||||
*
|
||||
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/list.h>
|
||||
|
||||
struct configfs_dirent {
|
||||
atomic_t s_count;
|
||||
struct list_head s_sibling;
|
||||
struct list_head s_children;
|
||||
struct list_head s_links;
|
||||
void * s_element;
|
||||
int s_type;
|
||||
umode_t s_mode;
|
||||
struct dentry * s_dentry;
|
||||
};
|
||||
|
||||
#define CONFIGFS_ROOT 0x0001
|
||||
#define CONFIGFS_DIR 0x0002
|
||||
#define CONFIGFS_ITEM_ATTR 0x0004
|
||||
#define CONFIGFS_ITEM_LINK 0x0020
|
||||
#define CONFIGFS_USET_DIR 0x0040
|
||||
#define CONFIGFS_USET_DEFAULT 0x0080
|
||||
#define CONFIGFS_USET_DROPPING 0x0100
|
||||
#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
|
||||
|
||||
extern struct vfsmount * configfs_mount;
|
||||
|
||||
extern int configfs_is_root(struct config_item *item);
|
||||
|
||||
extern struct inode * configfs_new_inode(mode_t mode);
|
||||
extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *));
|
||||
|
||||
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
|
||||
extern int configfs_make_dirent(struct configfs_dirent *,
|
||||
struct dentry *, void *, umode_t, int);
|
||||
|
||||
extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
|
||||
extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
|
||||
|
||||
extern const unsigned char * configfs_get_name(struct configfs_dirent *sd);
|
||||
extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent);
|
||||
|
||||
extern int configfs_pin_fs(void);
|
||||
extern void configfs_release_fs(void);
|
||||
|
||||
extern struct rw_semaphore configfs_rename_sem;
|
||||
extern struct super_block * configfs_sb;
|
||||
extern struct file_operations configfs_dir_operations;
|
||||
extern struct file_operations configfs_file_operations;
|
||||
extern struct file_operations bin_fops;
|
||||
extern struct inode_operations configfs_dir_inode_operations;
|
||||
extern struct inode_operations configfs_symlink_inode_operations;
|
||||
|
||||
extern int configfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *symname);
|
||||
extern int configfs_unlink(struct inode *dir, struct dentry *dentry);
|
||||
|
||||
struct configfs_symlink {
|
||||
struct list_head sl_list;
|
||||
struct config_item *sl_target;
|
||||
};
|
||||
|
||||
extern int configfs_create_link(struct configfs_symlink *sl,
|
||||
struct dentry *parent,
|
||||
struct dentry *dentry);
|
||||
|
||||
static inline struct config_item * to_item(struct dentry * dentry)
|
||||
{
|
||||
struct configfs_dirent * sd = dentry->d_fsdata;
|
||||
return ((struct config_item *) sd->s_element);
|
||||
}
|
||||
|
||||
static inline struct configfs_attribute * to_attr(struct dentry * dentry)
|
||||
{
|
||||
struct configfs_dirent * sd = dentry->d_fsdata;
|
||||
return ((struct configfs_attribute *) sd->s_element);
|
||||
}
|
||||
|
||||
static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
|
||||
{
|
||||
struct config_item * item = NULL;
|
||||
|
||||
spin_lock(&dcache_lock);
|
||||
if (!d_unhashed(dentry)) {
|
||||
struct configfs_dirent * sd = dentry->d_fsdata;
|
||||
if (sd->s_type & CONFIGFS_ITEM_LINK) {
|
||||
struct configfs_symlink * sl = sd->s_element;
|
||||
item = config_item_get(sl->sl_target);
|
||||
} else
|
||||
item = config_item_get(sd->s_element);
|
||||
}
|
||||
spin_unlock(&dcache_lock);
|
||||
|
||||
return item;
|
||||
}
|
||||
|
||||
static inline void release_configfs_dirent(struct configfs_dirent * sd)
|
||||
{
|
||||
if (!(sd->s_type & CONFIGFS_ROOT))
|
||||
kfree(sd);
|
||||
}
|
||||
|
||||
static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd)
|
||||
{
|
||||
if (sd) {
|
||||
WARN_ON(!atomic_read(&sd->s_count));
|
||||
atomic_inc(&sd->s_count);
|
||||
}
|
||||
return sd;
|
||||
}
|
||||
|
||||
static inline void configfs_put(struct configfs_dirent * sd)
|
||||
{
|
||||
WARN_ON(!atomic_read(&sd->s_count));
|
||||
if (atomic_dec_and_test(&sd->s_count))
|
||||
release_configfs_dirent(sd);
|
||||
}
|
||||
|
1102
fs/configfs/dir.c
Normal file
1102
fs/configfs/dir.c
Normal file
File diff suppressed because it is too large
Load Diff
360
fs/configfs/file.c
Normal file
360
fs/configfs/file.c
Normal file
@ -0,0 +1,360 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* file.c - operations for regular (text) files.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Based on sysfs:
|
||||
* sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
|
||||
*
|
||||
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/dnotify.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/semaphore.h>
|
||||
|
||||
#include <linux/configfs.h>
|
||||
#include "configfs_internal.h"
|
||||
|
||||
|
||||
struct configfs_buffer {
|
||||
size_t count;
|
||||
loff_t pos;
|
||||
char * page;
|
||||
struct configfs_item_operations * ops;
|
||||
struct semaphore sem;
|
||||
int needs_read_fill;
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* fill_read_buffer - allocate and fill buffer from item.
|
||||
* @dentry: dentry pointer.
|
||||
* @buffer: data buffer for file.
|
||||
*
|
||||
* Allocate @buffer->page, if it hasn't been already, then call the
|
||||
* config_item's show() method to fill the buffer with this attribute's
|
||||
* data.
|
||||
* This is called only once, on the file's first read.
|
||||
*/
|
||||
static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer)
|
||||
{
|
||||
struct configfs_attribute * attr = to_attr(dentry);
|
||||
struct config_item * item = to_item(dentry->d_parent);
|
||||
struct configfs_item_operations * ops = buffer->ops;
|
||||
int ret = 0;
|
||||
ssize_t count;
|
||||
|
||||
if (!buffer->page)
|
||||
buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
|
||||
if (!buffer->page)
|
||||
return -ENOMEM;
|
||||
|
||||
count = ops->show_attribute(item,attr,buffer->page);
|
||||
buffer->needs_read_fill = 0;
|
||||
BUG_ON(count > (ssize_t)PAGE_SIZE);
|
||||
if (count >= 0)
|
||||
buffer->count = count;
|
||||
else
|
||||
ret = count;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* flush_read_buffer - push buffer to userspace.
|
||||
* @buffer: data buffer for file.
|
||||
* @userbuf: user-passed buffer.
|
||||
* @count: number of bytes requested.
|
||||
* @ppos: file position.
|
||||
*
|
||||
* Copy the buffer we filled in fill_read_buffer() to userspace.
|
||||
* This is done at the reader's leisure, copying and advancing
|
||||
* the amount they specify each time.
|
||||
* This may be called continuously until the buffer is empty.
|
||||
*/
|
||||
static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf,
|
||||
size_t count, loff_t * ppos)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (*ppos > buffer->count)
|
||||
return 0;
|
||||
|
||||
if (count > (buffer->count - *ppos))
|
||||
count = buffer->count - *ppos;
|
||||
|
||||
error = copy_to_user(buf,buffer->page + *ppos,count);
|
||||
if (!error)
|
||||
*ppos += count;
|
||||
return error ? -EFAULT : count;
|
||||
}
|
||||
|
||||
/**
|
||||
* configfs_read_file - read an attribute.
|
||||
* @file: file pointer.
|
||||
* @buf: buffer to fill.
|
||||
* @count: number of bytes to read.
|
||||
* @ppos: starting offset in file.
|
||||
*
|
||||
* Userspace wants to read an attribute file. The attribute descriptor
|
||||
* is in the file's ->d_fsdata. The target item is in the directory's
|
||||
* ->d_fsdata.
|
||||
*
|
||||
* We call fill_read_buffer() to allocate and fill the buffer from the
|
||||
* item's show() method exactly once (if the read is happening from
|
||||
* the beginning of the file). That should fill the entire buffer with
|
||||
* all the data the item has to offer for that attribute.
|
||||
* We then call flush_read_buffer() to copy the buffer to userspace
|
||||
* in the increments specified.
|
||||
*/
|
||||
|
||||
static ssize_t
|
||||
configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
|
||||
{
|
||||
struct configfs_buffer * buffer = file->private_data;
|
||||
ssize_t retval = 0;
|
||||
|
||||
down(&buffer->sem);
|
||||
if (buffer->needs_read_fill) {
|
||||
if ((retval = fill_read_buffer(file->f_dentry,buffer)))
|
||||
goto out;
|
||||
}
|
||||
pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
|
||||
__FUNCTION__,count,*ppos,buffer->page);
|
||||
retval = flush_read_buffer(buffer,buf,count,ppos);
|
||||
out:
|
||||
up(&buffer->sem);
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* fill_write_buffer - copy buffer from userspace.
|
||||
* @buffer: data buffer for file.
|
||||
* @userbuf: data from user.
|
||||
* @count: number of bytes in @userbuf.
|
||||
*
|
||||
* Allocate @buffer->page if it hasn't been already, then
|
||||
* copy the user-supplied buffer into it.
|
||||
*/
|
||||
|
||||
static int
|
||||
fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (!buffer->page)
|
||||
buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
|
||||
if (!buffer->page)
|
||||
return -ENOMEM;
|
||||
|
||||
if (count > PAGE_SIZE)
|
||||
count = PAGE_SIZE;
|
||||
error = copy_from_user(buffer->page,buf,count);
|
||||
buffer->needs_read_fill = 1;
|
||||
return error ? -EFAULT : count;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* flush_write_buffer - push buffer to config_item.
|
||||
* @file: file pointer.
|
||||
* @buffer: data buffer for file.
|
||||
*
|
||||
* Get the correct pointers for the config_item and the attribute we're
|
||||
* dealing with, then call the store() method for the attribute,
|
||||
* passing the buffer that we acquired in fill_write_buffer().
|
||||
*/
|
||||
|
||||
static int
|
||||
flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count)
|
||||
{
|
||||
struct configfs_attribute * attr = to_attr(dentry);
|
||||
struct config_item * item = to_item(dentry->d_parent);
|
||||
struct configfs_item_operations * ops = buffer->ops;
|
||||
|
||||
return ops->store_attribute(item,attr,buffer->page,count);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* configfs_write_file - write an attribute.
|
||||
* @file: file pointer
|
||||
* @buf: data to write
|
||||
* @count: number of bytes
|
||||
* @ppos: starting offset
|
||||
*
|
||||
* Similar to configfs_read_file(), though working in the opposite direction.
|
||||
* We allocate and fill the data from the user in fill_write_buffer(),
|
||||
* then push it to the config_item in flush_write_buffer().
|
||||
* There is no easy way for us to know if userspace is only doing a partial
|
||||
* write, so we don't support them. We expect the entire buffer to come
|
||||
* on the first write.
|
||||
* Hint: if you're writing a value, first read the file, modify only the
|
||||
* the value you're changing, then write entire buffer back.
|
||||
*/
|
||||
|
||||
static ssize_t
|
||||
configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
|
||||
{
|
||||
struct configfs_buffer * buffer = file->private_data;
|
||||
|
||||
down(&buffer->sem);
|
||||
count = fill_write_buffer(buffer,buf,count);
|
||||
if (count > 0)
|
||||
count = flush_write_buffer(file->f_dentry,buffer,count);
|
||||
if (count > 0)
|
||||
*ppos += count;
|
||||
up(&buffer->sem);
|
||||
return count;
|
||||
}
|
||||
|
||||
static int check_perm(struct inode * inode, struct file * file)
|
||||
{
|
||||
struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent);
|
||||
struct configfs_attribute * attr = to_attr(file->f_dentry);
|
||||
struct configfs_buffer * buffer;
|
||||
struct configfs_item_operations * ops = NULL;
|
||||
int error = 0;
|
||||
|
||||
if (!item || !attr)
|
||||
goto Einval;
|
||||
|
||||
/* Grab the module reference for this attribute if we have one */
|
||||
if (!try_module_get(attr->ca_owner)) {
|
||||
error = -ENODEV;
|
||||
goto Done;
|
||||
}
|
||||
|
||||
if (item->ci_type)
|
||||
ops = item->ci_type->ct_item_ops;
|
||||
else
|
||||
goto Eaccess;
|
||||
|
||||
/* File needs write support.
|
||||
* The inode's perms must say it's ok,
|
||||
* and we must have a store method.
|
||||
*/
|
||||
if (file->f_mode & FMODE_WRITE) {
|
||||
|
||||
if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute)
|
||||
goto Eaccess;
|
||||
|
||||
}
|
||||
|
||||
/* File needs read support.
|
||||
* The inode's perms must say it's ok, and we there
|
||||
* must be a show method for it.
|
||||
*/
|
||||
if (file->f_mode & FMODE_READ) {
|
||||
if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute)
|
||||
goto Eaccess;
|
||||
}
|
||||
|
||||
/* No error? Great, allocate a buffer for the file, and store it
|
||||
* it in file->private_data for easy access.
|
||||
*/
|
||||
buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
|
||||
if (buffer) {
|
||||
memset(buffer,0,sizeof(struct configfs_buffer));
|
||||
init_MUTEX(&buffer->sem);
|
||||
buffer->needs_read_fill = 1;
|
||||
buffer->ops = ops;
|
||||
file->private_data = buffer;
|
||||
} else
|
||||
error = -ENOMEM;
|
||||
goto Done;
|
||||
|
||||
Einval:
|
||||
error = -EINVAL;
|
||||
goto Done;
|
||||
Eaccess:
|
||||
error = -EACCES;
|
||||
module_put(attr->ca_owner);
|
||||
Done:
|
||||
if (error && item)
|
||||
config_item_put(item);
|
||||
return error;
|
||||
}
|
||||
|
||||
/* ->open for attribute files: all of the work is done by check_perm(). */
static int configfs_open_file(struct inode * inode, struct file * filp)
{
	int status = check_perm(inode, filp);

	return status;
}
|
||||
|
||||
static int configfs_release(struct inode * inode, struct file * filp)
|
||||
{
|
||||
struct config_item * item = to_item(filp->f_dentry->d_parent);
|
||||
struct configfs_attribute * attr = to_attr(filp->f_dentry);
|
||||
struct module * owner = attr->ca_owner;
|
||||
struct configfs_buffer * buffer = filp->private_data;
|
||||
|
||||
if (item)
|
||||
config_item_put(item);
|
||||
/* After this point, attr should not be accessed. */
|
||||
module_put(owner);
|
||||
|
||||
if (buffer) {
|
||||
if (buffer->page)
|
||||
free_page((unsigned long)buffer->page);
|
||||
kfree(buffer);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct file_operations configfs_file_operations = {
|
||||
.read = configfs_read_file,
|
||||
.write = configfs_write_file,
|
||||
.llseek = generic_file_llseek,
|
||||
.open = configfs_open_file,
|
||||
.release = configfs_release,
|
||||
};
|
||||
|
||||
|
||||
int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type)
|
||||
{
|
||||
struct configfs_dirent * parent_sd = dir->d_fsdata;
|
||||
umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
|
||||
int error = 0;
|
||||
|
||||
down(&dir->d_inode->i_sem);
|
||||
error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type);
|
||||
up(&dir->d_inode->i_sem);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* configfs_create_file - create an attribute file for an item.
|
||||
* @item: item we're creating for.
|
||||
* @attr: atrribute descriptor.
|
||||
*/
|
||||
|
||||
int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr)
|
||||
{
|
||||
BUG_ON(!item || !item->ci_dentry || !attr);
|
||||
|
||||
return configfs_add_file(item->ci_dentry, attr,
|
||||
CONFIGFS_ITEM_ATTR);
|
||||
}
|
||||
|
162
fs/configfs/inode.c
Normal file
162
fs/configfs/inode.c
Normal file
@ -0,0 +1,162 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* inode.c - basic inode and dentry operations.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Based on sysfs:
|
||||
* sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
|
||||
*
|
||||
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* Please see Documentation/filesystems/configfs/configfs.txt for more information.
|
||||
*/
|
||||
|
||||
#undef DEBUG
|
||||
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/backing-dev.h>
|
||||
|
||||
#include <linux/configfs.h>
|
||||
#include "configfs_internal.h"
|
||||
|
||||
extern struct super_block * configfs_sb;
|
||||
|
||||
static struct address_space_operations configfs_aops = {
|
||||
.readpage = simple_readpage,
|
||||
.prepare_write = simple_prepare_write,
|
||||
.commit_write = simple_commit_write
|
||||
};
|
||||
|
||||
static struct backing_dev_info configfs_backing_dev_info = {
|
||||
.ra_pages = 0, /* No readahead */
|
||||
.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
|
||||
};
|
||||
|
||||
/*
 * configfs_new_inode - allocate and initialise an inode on configfs_sb.
 *
 * The inode gets @mode, uid/gid 0, the current time for all three
 * timestamps and the configfs address-space/backing-dev settings.
 * Returns the new inode, or NULL if new_inode() fails.
 */
struct inode * configfs_new_inode(mode_t mode)
{
	struct inode * inode = new_inode(configfs_sb);

	if (!inode)
		return NULL;

	inode->i_mode = mode;
	inode->i_uid = 0;
	inode->i_gid = 0;
	inode->i_blksize = PAGE_CACHE_SIZE;
	inode->i_blocks = 0;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	inode->i_mapping->a_ops = &configfs_aops;
	inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;

	return inode;
}
|
||||
|
||||
int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *))
|
||||
{
|
||||
int error = 0;
|
||||
struct inode * inode = NULL;
|
||||
if (dentry) {
|
||||
if (!dentry->d_inode) {
|
||||
if ((inode = configfs_new_inode(mode))) {
|
||||
if (dentry->d_parent && dentry->d_parent->d_inode) {
|
||||
struct inode *p_inode = dentry->d_parent->d_inode;
|
||||
p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME;
|
||||
}
|
||||
goto Proceed;
|
||||
}
|
||||
else
|
||||
error = -ENOMEM;
|
||||
} else
|
||||
error = -EEXIST;
|
||||
} else
|
||||
error = -ENOENT;
|
||||
goto Done;
|
||||
|
||||
Proceed:
|
||||
if (init)
|
||||
error = init(inode);
|
||||
if (!error) {
|
||||
d_instantiate(dentry, inode);
|
||||
if (S_ISDIR(mode) || S_ISLNK(mode))
|
||||
dget(dentry); /* pin link and directory dentries in core */
|
||||
} else
|
||||
iput(inode);
|
||||
Done:
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the name for corresponding element represented by the given configfs_dirent
|
||||
*/
|
||||
const unsigned char * configfs_get_name(struct configfs_dirent *sd)
|
||||
{
|
||||
struct attribute * attr;
|
||||
|
||||
if (!sd || !sd->s_element)
|
||||
BUG();
|
||||
|
||||
/* These always have a dentry, so use that */
|
||||
if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
|
||||
return sd->s_dentry->d_name.name;
|
||||
|
||||
if (sd->s_type & CONFIGFS_ITEM_ATTR) {
|
||||
attr = sd->s_element;
|
||||
return attr->name;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Unhashes the dentry corresponding to given configfs_dirent
|
||||
* Called with parent inode's i_sem held.
|
||||
*/
|
||||
void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
|
||||
{
|
||||
struct dentry * dentry = sd->s_dentry;
|
||||
|
||||
if (dentry) {
|
||||
spin_lock(&dcache_lock);
|
||||
if (!(d_unhashed(dentry) && dentry->d_inode)) {
|
||||
dget_locked(dentry);
|
||||
__d_drop(dentry);
|
||||
spin_unlock(&dcache_lock);
|
||||
simple_unlink(parent->d_inode, dentry);
|
||||
} else
|
||||
spin_unlock(&dcache_lock);
|
||||
}
|
||||
}
|
||||
|
||||
void configfs_hash_and_remove(struct dentry * dir, const char * name)
|
||||
{
|
||||
struct configfs_dirent * sd;
|
||||
struct configfs_dirent * parent_sd = dir->d_fsdata;
|
||||
|
||||
down(&dir->d_inode->i_sem);
|
||||
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
|
||||
if (!sd->s_element)
|
||||
continue;
|
||||
if (!strcmp(configfs_get_name(sd), name)) {
|
||||
list_del_init(&sd->s_sibling);
|
||||
configfs_drop_dentry(sd, dir);
|
||||
configfs_put(sd);
|
||||
break;
|
||||
}
|
||||
}
|
||||
up(&dir->d_inode->i_sem);
|
||||
}
|
||||
|
||||
|
227
fs/configfs/item.c
Normal file
227
fs/configfs/item.c
Normal file
@ -0,0 +1,227 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* item.c - library routines for handling generic config items
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Based on kobject:
|
||||
* kobject is Copyright (c) 2002-2003 Patrick Mochel
|
||||
*
|
||||
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* Please see the file Documentation/filesystems/configfs/configfs.txt for
|
||||
* critical information about using the config_item interface.
|
||||
*/
|
||||
|
||||
#include <linux/string.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/stat.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <linux/configfs.h>
|
||||
|
||||
|
||||
static inline struct config_item * to_item(struct list_head * entry)
|
||||
{
|
||||
return container_of(entry,struct config_item,ci_entry);
|
||||
}
|
||||
|
||||
/* Evil kernel */
|
||||
static void config_item_release(struct kref *kref);
|
||||
|
||||
/**
|
||||
* config_item_init - initialize item.
|
||||
* @item: item in question.
|
||||
*/
|
||||
void config_item_init(struct config_item * item)
|
||||
{
|
||||
kref_init(&item->ci_kref);
|
||||
INIT_LIST_HEAD(&item->ci_entry);
|
||||
}
|
||||
|
||||
/**
|
||||
* config_item_set_name - Set the name of an item
|
||||
* @item: item.
|
||||
* @name: name.
|
||||
*
|
||||
* If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
|
||||
* dynamically allocated string that @item->ci_name points to.
|
||||
* Otherwise, use the static @item->ci_namebuf array.
|
||||
*/
|
||||
|
||||
int config_item_set_name(struct config_item * item, const char * fmt, ...)
|
||||
{
|
||||
int error = 0;
|
||||
int limit = CONFIGFS_ITEM_NAME_LEN;
|
||||
int need;
|
||||
va_list args;
|
||||
char * name;
|
||||
|
||||
/*
|
||||
* First, try the static array
|
||||
*/
|
||||
va_start(args,fmt);
|
||||
need = vsnprintf(item->ci_namebuf,limit,fmt,args);
|
||||
va_end(args);
|
||||
if (need < limit)
|
||||
name = item->ci_namebuf;
|
||||
else {
|
||||
/*
|
||||
* Need more space? Allocate it and try again
|
||||
*/
|
||||
limit = need + 1;
|
||||
name = kmalloc(limit,GFP_KERNEL);
|
||||
if (!name) {
|
||||
error = -ENOMEM;
|
||||
goto Done;
|
||||
}
|
||||
va_start(args,fmt);
|
||||
need = vsnprintf(name,limit,fmt,args);
|
||||
va_end(args);
|
||||
|
||||
/* Still? Give up. */
|
||||
if (need >= limit) {
|
||||
kfree(name);
|
||||
error = -EFAULT;
|
||||
goto Done;
|
||||
}
|
||||
}
|
||||
|
||||
/* Free the old name, if necessary. */
|
||||
if (item->ci_name && item->ci_name != item->ci_namebuf)
|
||||
kfree(item->ci_name);
|
||||
|
||||
/* Now, set the new name */
|
||||
item->ci_name = name;
|
||||
Done:
|
||||
return error;
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL(config_item_set_name);
|
||||
|
||||
void config_item_init_type_name(struct config_item *item,
|
||||
const char *name,
|
||||
struct config_item_type *type)
|
||||
{
|
||||
config_item_set_name(item, name);
|
||||
item->ci_type = type;
|
||||
config_item_init(item);
|
||||
}
|
||||
EXPORT_SYMBOL(config_item_init_type_name);
|
||||
|
||||
void config_group_init_type_name(struct config_group *group, const char *name,
|
||||
struct config_item_type *type)
|
||||
{
|
||||
config_item_set_name(&group->cg_item, name);
|
||||
group->cg_item.ci_type = type;
|
||||
config_group_init(group);
|
||||
}
|
||||
EXPORT_SYMBOL(config_group_init_type_name);
|
||||
|
||||
struct config_item * config_item_get(struct config_item * item)
|
||||
{
|
||||
if (item)
|
||||
kref_get(&item->ci_kref);
|
||||
return item;
|
||||
}
|
||||
|
||||
/**
|
||||
* config_item_cleanup - free config_item resources.
|
||||
* @item: item.
|
||||
*/
|
||||
|
||||
void config_item_cleanup(struct config_item * item)
|
||||
{
|
||||
struct config_item_type * t = item->ci_type;
|
||||
struct config_group * s = item->ci_group;
|
||||
struct config_item * parent = item->ci_parent;
|
||||
|
||||
pr_debug("config_item %s: cleaning up\n",config_item_name(item));
|
||||
if (item->ci_name != item->ci_namebuf)
|
||||
kfree(item->ci_name);
|
||||
item->ci_name = NULL;
|
||||
if (t && t->ct_item_ops && t->ct_item_ops->release)
|
||||
t->ct_item_ops->release(item);
|
||||
if (s)
|
||||
config_group_put(s);
|
||||
if (parent)
|
||||
config_item_put(parent);
|
||||
}
|
||||
|
||||
static void config_item_release(struct kref *kref)
|
||||
{
|
||||
config_item_cleanup(container_of(kref, struct config_item, ci_kref));
|
||||
}
|
||||
|
||||
/**
|
||||
* config_item_put - decrement refcount for item.
|
||||
* @item: item.
|
||||
*
|
||||
* Decrement the refcount, and if 0, call config_item_cleanup().
|
||||
*/
|
||||
void config_item_put(struct config_item * item)
|
||||
{
|
||||
if (item)
|
||||
kref_put(&item->ci_kref, config_item_release);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* config_group_init - initialize a group for use
|
||||
* @k: group
|
||||
*/
|
||||
|
||||
void config_group_init(struct config_group *group)
|
||||
{
|
||||
config_item_init(&group->cg_item);
|
||||
INIT_LIST_HEAD(&group->cg_children);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* config_group_find_obj - search for item in group.
|
||||
* @group: group we're looking in.
|
||||
* @name: item's name.
|
||||
*
|
||||
* Lock group via @group->cg_subsys, and iterate over @group->cg_list,
|
||||
* looking for a matching config_item. If matching item is found
|
||||
* take a reference and return the item.
|
||||
*/
|
||||
|
||||
struct config_item * config_group_find_obj(struct config_group * group, const char * name)
|
||||
{
|
||||
struct list_head * entry;
|
||||
struct config_item * ret = NULL;
|
||||
|
||||
/* XXX LOCKING! */
|
||||
list_for_each(entry,&group->cg_children) {
|
||||
struct config_item * item = to_item(entry);
|
||||
if (config_item_name(item) &&
|
||||
!strcmp(config_item_name(item), name)) {
|
||||
ret = config_item_get(item);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
EXPORT_SYMBOL(config_item_init);
|
||||
EXPORT_SYMBOL(config_group_init);
|
||||
EXPORT_SYMBOL(config_item_get);
|
||||
EXPORT_SYMBOL(config_item_put);
|
||||
|
159
fs/configfs/mount.c
Normal file
159
fs/configfs/mount.c
Normal file
@ -0,0 +1,159 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* mount.c - operations for initializing and mounting configfs.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Based on sysfs:
|
||||
* sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
|
||||
*
|
||||
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/init.h>
|
||||
|
||||
#include <linux/configfs.h>
|
||||
#include "configfs_internal.h"
|
||||
|
||||
/* Random magic number */
|
||||
#define CONFIGFS_MAGIC 0x62656570
|
||||
|
||||
struct vfsmount * configfs_mount = NULL;
|
||||
struct super_block * configfs_sb = NULL;
|
||||
static int configfs_mnt_count = 0;
|
||||
|
||||
static struct super_operations configfs_ops = {
|
||||
.statfs = simple_statfs,
|
||||
.drop_inode = generic_delete_inode,
|
||||
};
|
||||
|
||||
static struct config_group configfs_root_group = {
|
||||
.cg_item = {
|
||||
.ci_namebuf = "root",
|
||||
.ci_name = configfs_root_group.cg_item.ci_namebuf,
|
||||
},
|
||||
};
|
||||
|
||||
int configfs_is_root(struct config_item *item)
|
||||
{
|
||||
return item == &configfs_root_group.cg_item;
|
||||
}
|
||||
|
||||
static struct configfs_dirent configfs_root = {
|
||||
.s_sibling = LIST_HEAD_INIT(configfs_root.s_sibling),
|
||||
.s_children = LIST_HEAD_INIT(configfs_root.s_children),
|
||||
.s_element = &configfs_root_group.cg_item,
|
||||
.s_type = CONFIGFS_ROOT,
|
||||
};
|
||||
|
||||
static int configfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
{
|
||||
struct inode *inode;
|
||||
struct dentry *root;
|
||||
|
||||
sb->s_blocksize = PAGE_CACHE_SIZE;
|
||||
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
|
||||
sb->s_magic = CONFIGFS_MAGIC;
|
||||
sb->s_op = &configfs_ops;
|
||||
configfs_sb = sb;
|
||||
|
||||
inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
|
||||
if (inode) {
|
||||
inode->i_op = &configfs_dir_inode_operations;
|
||||
inode->i_fop = &configfs_dir_operations;
|
||||
/* directory inodes start off with i_nlink == 2 (for "." entry) */
|
||||
inode->i_nlink++;
|
||||
} else {
|
||||
pr_debug("configfs: could not get root inode\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
root = d_alloc_root(inode);
|
||||
if (!root) {
|
||||
pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
|
||||
iput(inode);
|
||||
return -ENOMEM;
|
||||
}
|
||||
config_group_init(&configfs_root_group);
|
||||
configfs_root_group.cg_item.ci_dentry = root;
|
||||
root->d_fsdata = &configfs_root;
|
||||
sb->s_root = root;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
|
||||
int flags, const char *dev_name, void *data)
|
||||
{
|
||||
return get_sb_single(fs_type, flags, data, configfs_fill_super);
|
||||
}
|
||||
|
||||
static struct file_system_type configfs_fs_type = {
|
||||
.owner = THIS_MODULE,
|
||||
.name = "configfs",
|
||||
.get_sb = configfs_get_sb,
|
||||
.kill_sb = kill_litter_super,
|
||||
};
|
||||
|
||||
int configfs_pin_fs(void)
|
||||
{
|
||||
return simple_pin_fs("configfs", &configfs_mount,
|
||||
&configfs_mnt_count);
|
||||
}
|
||||
|
||||
void configfs_release_fs(void)
|
||||
{
|
||||
simple_release_fs(&configfs_mount, &configfs_mnt_count);
|
||||
}
|
||||
|
||||
|
||||
static decl_subsys(config, NULL, NULL);
|
||||
|
||||
/*
 * Module init: register the "config" subsystem under kernel_subsys,
 * then the filesystem type.  The subsystem is unregistered again if
 * the filesystem cannot be registered.  Returns 0 or the error from
 * whichever registration failed.
 */
static int __init configfs_init(void)
{
	int err;

	kset_set_kset_s(&config_subsys, kernel_subsys);
	err = subsystem_register(&config_subsys);
	if (err)
		return err;

	err = register_filesystem(&configfs_fs_type);
	if (!err)
		return 0;

	printk(KERN_ERR "configfs: Unable to register filesystem!\n");
	subsystem_unregister(&config_subsys);
	return err;
}
|
||||
|
||||
/* Module exit: undo configfs_init() in reverse order. */
static void __exit configfs_exit(void)
{
	/* Filesystem type first ... */
	unregister_filesystem(&configfs_fs_type);
	/* ... then the subsystem. */
	subsystem_unregister(&config_subsys);
}
|
||||
|
||||
MODULE_AUTHOR("Oracle");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_VERSION("0.0.1");
|
||||
MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
|
||||
|
||||
module_init(configfs_init);
|
||||
module_exit(configfs_exit);
|
281
fs/configfs/symlink.c
Normal file
281
fs/configfs/symlink.c
Normal file
@ -0,0 +1,281 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* symlink.c - operations for configfs symlinks.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
* Based on sysfs:
|
||||
* sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
|
||||
*
|
||||
* configfs Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/namei.h>
|
||||
|
||||
#include <linux/configfs.h>
|
||||
#include "configfs_internal.h"
|
||||
|
||||
static int item_depth(struct config_item * item)
|
||||
{
|
||||
struct config_item * p = item;
|
||||
int depth = 0;
|
||||
do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
|
||||
return depth;
|
||||
}
|
||||
|
||||
static int item_path_length(struct config_item * item)
|
||||
{
|
||||
struct config_item * p = item;
|
||||
int length = 1;
|
||||
do {
|
||||
length += strlen(config_item_name(p)) + 1;
|
||||
p = p->ci_parent;
|
||||
} while (p && !configfs_is_root(p));
|
||||
return length;
|
||||
}
|
||||
|
||||
static void fill_item_path(struct config_item * item, char * buffer, int length)
|
||||
{
|
||||
struct config_item * p;
|
||||
|
||||
--length;
|
||||
for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
|
||||
int cur = strlen(config_item_name(p));
|
||||
|
||||
/* back up enough to print this bus id with '/' */
|
||||
length -= cur;
|
||||
strncpy(buffer + length,config_item_name(p),cur);
|
||||
*(buffer + --length) = '/';
|
||||
}
|
||||
}
|
||||
|
||||
static int create_link(struct config_item *parent_item,
|
||||
struct config_item *item,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
|
||||
struct configfs_symlink *sl;
|
||||
int ret;
|
||||
|
||||
ret = -ENOMEM;
|
||||
sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
|
||||
if (sl) {
|
||||
sl->sl_target = config_item_get(item);
|
||||
/* FIXME: needs a lock, I'd bet */
|
||||
list_add(&sl->sl_list, &target_sd->s_links);
|
||||
ret = configfs_create_link(sl, parent_item->ci_dentry,
|
||||
dentry);
|
||||
if (ret) {
|
||||
list_del_init(&sl->sl_list);
|
||||
config_item_put(item);
|
||||
kfree(sl);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static int get_target(const char *symname, struct nameidata *nd,
|
||||
struct config_item **target)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
|
||||
if (!ret) {
|
||||
if (nd->dentry->d_sb == configfs_sb) {
|
||||
*target = configfs_get_config_item(nd->dentry);
|
||||
if (!*target) {
|
||||
ret = -ENOENT;
|
||||
path_release(nd);
|
||||
}
|
||||
} else
|
||||
ret = -EPERM;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
|
||||
{
|
||||
int ret;
|
||||
struct nameidata nd;
|
||||
struct config_item *parent_item;
|
||||
struct config_item *target_item;
|
||||
struct config_item_type *type;
|
||||
|
||||
ret = -EPERM; /* What lack-of-symlink returns */
|
||||
if (dentry->d_parent == configfs_sb->s_root)
|
||||
goto out;
|
||||
|
||||
parent_item = configfs_get_config_item(dentry->d_parent);
|
||||
type = parent_item->ci_type;
|
||||
|
||||
if (!type || !type->ct_item_ops ||
|
||||
!type->ct_item_ops->allow_link)
|
||||
goto out_put;
|
||||
|
||||
ret = get_target(symname, &nd, &target_item);
|
||||
if (ret)
|
||||
goto out_put;
|
||||
|
||||
ret = type->ct_item_ops->allow_link(parent_item, target_item);
|
||||
if (!ret)
|
||||
ret = create_link(parent_item, target_item, dentry);
|
||||
|
||||
config_item_put(target_item);
|
||||
path_release(&nd);
|
||||
|
||||
out_put:
|
||||
config_item_put(parent_item);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int configfs_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
struct configfs_dirent *sd = dentry->d_fsdata;
|
||||
struct configfs_symlink *sl;
|
||||
struct config_item *parent_item;
|
||||
struct config_item_type *type;
|
||||
int ret;
|
||||
|
||||
ret = -EPERM; /* What lack-of-symlink returns */
|
||||
if (!(sd->s_type & CONFIGFS_ITEM_LINK))
|
||||
goto out;
|
||||
|
||||
if (dentry->d_parent == configfs_sb->s_root)
|
||||
BUG();
|
||||
|
||||
sl = sd->s_element;
|
||||
|
||||
parent_item = configfs_get_config_item(dentry->d_parent);
|
||||
type = parent_item->ci_type;
|
||||
|
||||
list_del_init(&sd->s_sibling);
|
||||
configfs_drop_dentry(sd, dentry->d_parent);
|
||||
dput(dentry);
|
||||
configfs_put(sd);
|
||||
|
||||
/*
|
||||
* drop_link() must be called before
|
||||
* list_del_init(&sl->sl_list), so that the order of
|
||||
* drop_link(this, target) and drop_item(target) is preserved.
|
||||
*/
|
||||
if (type && type->ct_item_ops &&
|
||||
type->ct_item_ops->drop_link)
|
||||
type->ct_item_ops->drop_link(parent_item,
|
||||
sl->sl_target);
|
||||
|
||||
/* FIXME: Needs lock */
|
||||
list_del_init(&sl->sl_list);
|
||||
|
||||
/* Put reference from create_link() */
|
||||
config_item_put(sl->sl_target);
|
||||
kfree(sl);
|
||||
|
||||
config_item_put(parent_item);
|
||||
|
||||
ret = 0;
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int configfs_get_target_path(struct config_item * item, struct config_item * target,
|
||||
char *path)
|
||||
{
|
||||
char * s;
|
||||
int depth, size;
|
||||
|
||||
depth = item_depth(item);
|
||||
size = item_path_length(target) + depth * 3 - 1;
|
||||
if (size > PATH_MAX)
|
||||
return -ENAMETOOLONG;
|
||||
|
||||
pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
|
||||
|
||||
for (s = path; depth--; s += 3)
|
||||
strcpy(s,"../");
|
||||
|
||||
fill_item_path(target, path, size);
|
||||
pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int configfs_getlink(struct dentry *dentry, char * path)
|
||||
{
|
||||
struct config_item *item, *target_item;
|
||||
int error = 0;
|
||||
|
||||
item = configfs_get_config_item(dentry->d_parent);
|
||||
if (!item)
|
||||
return -EINVAL;
|
||||
|
||||
target_item = configfs_get_config_item(dentry);
|
||||
if (!target_item) {
|
||||
config_item_put(item);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
down_read(&configfs_rename_sem);
|
||||
error = configfs_get_target_path(item, target_item, path);
|
||||
up_read(&configfs_rename_sem);
|
||||
|
||||
config_item_put(item);
|
||||
config_item_put(target_item);
|
||||
return error;
|
||||
|
||||
}
|
||||
|
||||
static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
|
||||
{
|
||||
int error = -ENOMEM;
|
||||
unsigned long page = get_zeroed_page(GFP_KERNEL);
|
||||
|
||||
if (page) {
|
||||
error = configfs_getlink(dentry, (char *)page);
|
||||
if (!error) {
|
||||
nd_set_link(nd, (char *)page);
|
||||
return (void *)page;
|
||||
}
|
||||
}
|
||||
|
||||
nd_set_link(nd, ERR_PTR(error));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * ->put_link: release the page handed out as the cookie by
 * configfs_follow_link().  A NULL cookie means there is nothing to free.
 */
static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
			      void *cookie)
{
	if (!cookie)
		return;

	free_page((unsigned long)cookie);
}
|
||||
|
||||
struct inode_operations configfs_symlink_inode_operations = {
|
||||
.follow_link = configfs_follow_link,
|
||||
.readlink = generic_readlink,
|
||||
.put_link = configfs_put_link,
|
||||
};
|
||||
|
@ -721,7 +721,7 @@ retry:
|
||||
&last_block_in_bio, &ret, wbc,
|
||||
page->mapping->a_ops->writepage);
|
||||
}
|
||||
if (unlikely(ret == WRITEPAGE_ACTIVATE))
|
||||
if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
|
||||
unlock_page(page);
|
||||
if (ret || (--(wbc->nr_to_write) <= 0))
|
||||
done = 1;
|
||||
|
33
fs/ocfs2/Makefile
Normal file
33
fs/ocfs2/Makefile
Normal file
@ -0,0 +1,33 @@
|
||||
# Let sources in this directory include headers relative to fs/ocfs2.
EXTRA_CFLAGS += -Ifs/ocfs2

# Defines CATCH_BH_JBD_RACES for every object built here.
EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES

obj-$(CONFIG_OCFS2_FS) += ocfs2.o

# Objects linked into ocfs2.o.
ocfs2-objs := alloc.o aops.o buffer_head_io.o dcache.o dir.o dlmglue.o \
	export.o extent_map.o file.o heartbeat.o inode.o journal.o \
	localalloc.o mmap.o namei.o slot_map.o suballoc.o super.o \
	symlink.o sysfile.o uptodate.o ver.o vote.o

# Sub-directories built together with the filesystem.
obj-$(CONFIG_OCFS2_FS) += cluster/
obj-$(CONFIG_OCFS2_FS) += dlm/
|
2040
fs/ocfs2/alloc.c
Normal file
2040
fs/ocfs2/alloc.c
Normal file
File diff suppressed because it is too large
Load Diff
82
fs/ocfs2/alloc.h
Normal file
82
fs/ocfs2/alloc.h
Normal file
@ -0,0 +1,82 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* alloc.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
 * Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_ALLOC_H
|
||||
#define OCFS2_ALLOC_H
|
||||
|
||||
struct ocfs2_alloc_context;
|
||||
int ocfs2_insert_extent(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct inode *inode,
|
||||
struct buffer_head *fe_bh,
|
||||
u64 blkno,
|
||||
u32 new_clusters,
|
||||
struct ocfs2_alloc_context *meta_ac);
|
||||
int ocfs2_num_free_extents(struct ocfs2_super *osb,
|
||||
struct inode *inode,
|
||||
struct ocfs2_dinode *fe);
|
||||
/* how many new metadata chunks would an allocation need at maximum? */
|
||||
static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
|
||||
{
|
||||
/*
|
||||
* Rather than do all the work of determining how much we need
|
||||
* (involves a ton of reads and locks), just ask for the
|
||||
* maximal limit. That's a tree depth shift. So, one block for
|
||||
* level of the tree (current l_tree_depth), one block for the
|
||||
* new tree_depth==0 extent_block, and one block at the new
|
||||
* top-of-the tree.
|
||||
*/
|
||||
return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
|
||||
}
|
||||
|
||||
int ocfs2_truncate_log_init(struct ocfs2_super *osb);
|
||||
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
|
||||
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
|
||||
int cancel);
|
||||
int ocfs2_flush_truncate_log(struct ocfs2_super *osb);
|
||||
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
|
||||
int slot_num,
|
||||
struct ocfs2_dinode **tl_copy);
|
||||
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
|
||||
struct ocfs2_dinode *tl_copy);
|
||||
|
||||
/*
 * State handed from ocfs2_prepare_truncate() to ocfs2_commit_truncate()
 * (see the prototypes that follow this definition).
 */
struct ocfs2_truncate_context {
	struct inode		*tc_ext_alloc_inode;
	struct buffer_head	*tc_ext_alloc_bh;
	int			tc_ext_alloc_locked; /* is it cluster locked? */

	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
	struct buffer_head	*tc_last_eb_bh;
};
|
||||
|
||||
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
|
||||
struct inode *inode,
|
||||
struct buffer_head *fe_bh,
|
||||
struct ocfs2_truncate_context **tc);
|
||||
int ocfs2_commit_truncate(struct ocfs2_super *osb,
|
||||
struct inode *inode,
|
||||
struct buffer_head *fe_bh,
|
||||
struct ocfs2_truncate_context *tc);
|
||||
|
||||
#endif /* OCFS2_ALLOC_H */
|
643
fs/ocfs2/aops.c
Normal file
643
fs/ocfs2/aops.c
Normal file
@ -0,0 +1,643 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
 * Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <asm/byteorder.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_FILE_IO
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "aops.h"
|
||||
#include "dlmglue.h"
|
||||
#include "extent_map.h"
|
||||
#include "file.h"
|
||||
#include "inode.h"
|
||||
#include "journal.h"
|
||||
#include "super.h"
|
||||
#include "symlink.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
/*
 * get_block implementation for symlinks that are not "fast" symlinks.
 *
 * Reads the inode's dinode block, validates it and the requested offset,
 * and maps @bh_result to l_recs[0].e_blkno + @iblock.  For an inode that
 * ocfs2_inode_is_new() reports as new and whose result buffer is not yet
 * uptodate, the data is first copied out of the buffer cache because symlink
 * data is not created through the page cache.
 *
 * Returns 0 on success and -EIO on any failure.
 */
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int status;
	struct ocfs2_dinode *fe = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *buffer_cache_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	void *kaddr;

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	BUG_ON(ocfs2_inode_is_fast_symlink(inode));

	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
		mlog(ML_ERROR, "block offset > PATH_MAX: %llu\n",
		     (unsigned long long)iblock);
		goto bail;
	}

	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				  OCFS2_I(inode)->ip_blkno,
				  &bh, OCFS2_BH_CACHED, inode);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

	if (!OCFS2_IS_VALID_DINODE(fe)) {
		mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n",
		     fe->i_blkno, 7, fe->i_signature);
		goto bail;
	}

	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
						    le32_to_cpu(fe->i_clusters))) {
		mlog(ML_ERROR, "block offset is outside the allocated size: "
		     "%llu\n", (unsigned long long)iblock);
		goto bail;
	}

	/* We don't use the page cache to create symlink data, so if
	 * need be, copy it over from the buffer cache. */
	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
			    iblock;
		buffer_cache_bh = sb_getblk(osb->sb, blkno);
		if (!buffer_cache_bh) {
			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
			goto bail;
		}

		/* we haven't locked out transactions, so a commit
		 * could've happened. Since we've got a reference on
		 * the bh, even if it commits while we're doing the
		 * copy, the data is still good. */
		if (buffer_jbd(buffer_cache_bh)
		    && ocfs2_inode_is_new(inode)) {
			kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
			if (!kaddr) {
				mlog(ML_ERROR, "couldn't kmap!\n");
				/* drop the sb_getblk() reference before bailing */
				brelse(buffer_cache_bh);
				goto bail;
			}
			memcpy(kaddr + (bh_result->b_size * iblock),
			       buffer_cache_bh->b_data,
			       bh_result->b_size);
			kunmap_atomic(kaddr, KM_USER0);
			set_buffer_uptodate(bh_result);
		}
		brelse(buffer_cache_bh);
	}

	map_bh(bh_result, inode->i_sb,
	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);

	err = 0;

bail:
	if (bh)
		brelse(bh);

	mlog_exit(err);
	return err;
}
|
||||
|
||||
/*
 * get_block callback used by the address space operations in this file.
 *
 * Symlinks are delegated to ocfs2_symlink_get_block().  For everything else
 * @iblock is checked against ip_clusters, translated through
 * ocfs2_extent_map_get_blocks() and mapped into @bh_result.  When @create is
 * set and the block lies at or past the block count derived from i_size, the
 * buffer is flagged new.
 *
 * Returns 0 on success; every negative error is reported as -EIO.
 */
static int ocfs2_get_block(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create)
{
	int err = 0;
	/* zeroed so the error message below never prints an indeterminate value */
	u64 p_blkno = 0;
	u64 past_eof;

	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
		   (unsigned long long)iblock, bh_result, create);

	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
		     inode, inode->i_ino);

	if (S_ISLNK(inode->i_mode)) {
		/* this always does I/O for some reason. */
		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
		goto bail;
	}

	/* this can happen if another node truncs after our extend! */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
					       OCFS2_I(inode)->ip_clusters))
		err = -EIO;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	if (err)
		goto bail;

	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
					  NULL);
	if (err) {
		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
		     "%"MLFu64", NULL)\n", err, inode,
		     (unsigned long long)iblock, p_blkno);
		goto bail;
	}

	map_bh(bh_result, inode->i_sb, p_blkno);

	/* a mapping to block 0 is reported as an error */
	if (bh_result->b_blocknr == 0) {
		err = -EIO;
		mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" "
		     "blkno=(%"MLFu64")\n", (unsigned long long)iblock,
		     p_blkno, OCFS2_I(inode)->ip_blkno);
	}

	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
	mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof);

	if (create && (iblock >= past_eof))
		set_buffer_new(bh_result);

bail:
	/* collapse every failure into -EIO for the caller */
	if (err < 0)
		err = -EIO;

	mlog_exit(err);
	return err;
}
|
||||
|
||||
static int ocfs2_readpage(struct file *file, struct page *page)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
|
||||
int ret, unlock = 1;
|
||||
|
||||
mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
|
||||
|
||||
ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
|
||||
if (ret != 0) {
|
||||
if (ret == AOP_TRUNCATED_PAGE)
|
||||
unlock = 0;
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
down_read(&OCFS2_I(inode)->ip_alloc_sem);
|
||||
|
||||
/*
|
||||
* i_size might have just been updated as we grabed the meta lock. We
|
||||
* might now be discovering a truncate that hit on another node.
|
||||
* block_read_full_page->get_block freaks out if it is asked to read
|
||||
* beyond the end of a file, so we check here. Callers
|
||||
* (generic_file_read, fault->nopage) are clever enough to check i_size
|
||||
* and notice that the page they just read isn't needed.
|
||||
*
|
||||
* XXX sys_readahead() seems to get that wrong?
|
||||
*/
|
||||
if (start >= i_size_read(inode)) {
|
||||
char *addr = kmap(page);
|
||||
memset(addr, 0, PAGE_SIZE);
|
||||
flush_dcache_page(page);
|
||||
kunmap(page);
|
||||
SetPageUptodate(page);
|
||||
ret = 0;
|
||||
goto out_alloc;
|
||||
}
|
||||
|
||||
ret = ocfs2_data_lock_with_page(inode, 0, page);
|
||||
if (ret != 0) {
|
||||
if (ret == AOP_TRUNCATED_PAGE)
|
||||
unlock = 0;
|
||||
mlog_errno(ret);
|
||||
goto out_alloc;
|
||||
}
|
||||
|
||||
ret = block_read_full_page(page, ocfs2_get_block);
|
||||
unlock = 0;
|
||||
|
||||
ocfs2_data_unlock(inode, 0);
|
||||
out_alloc:
|
||||
up_read(&OCFS2_I(inode)->ip_alloc_sem);
|
||||
ocfs2_meta_unlock(inode, 0);
|
||||
out:
|
||||
if (unlock)
|
||||
unlock_page(page);
|
||||
mlog_exit(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Note: Because we don't support holes, our allocation has
|
||||
* already happened (allocation writes zeros to the file data)
|
||||
* so we don't have to worry about ordered writes in
|
||||
* ocfs2_writepage.
|
||||
*
|
||||
* ->writepage is called during the process of invalidating the page cache
|
||||
* during blocked lock processing. It can't block on any cluster locks
|
||||
* to during block mapping. It's relying on the fact that the block
|
||||
* mapping can't have disappeared under the dirty pages that it is
|
||||
* being asked to write back.
|
||||
*/
|
||||
static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
int ret;
|
||||
|
||||
mlog_entry("(0x%p)\n", page);
|
||||
|
||||
ret = block_write_full_page(page, ocfs2_get_block, wbc);
|
||||
|
||||
mlog_exit(ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
|
||||
* from loopback. It must be able to perform its own locking around
|
||||
* ocfs2_get_block().
|
||||
*/
|
||||
int ocfs2_prepare_write(struct file *file, struct page *page,
|
||||
unsigned from, unsigned to)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
int ret;
|
||||
|
||||
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
|
||||
|
||||
ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page);
|
||||
if (ret != 0) {
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
down_read(&OCFS2_I(inode)->ip_alloc_sem);
|
||||
|
||||
ret = block_prepare_write(page, from, to, ocfs2_get_block);
|
||||
|
||||
up_read(&OCFS2_I(inode)->ip_alloc_sem);
|
||||
|
||||
ocfs2_meta_unlock(inode, 0);
|
||||
out:
|
||||
mlog_exit(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Taken from ext3. We don't necessarily need the full blown
|
||||
* functionality yet, but IMHO it's better to cut and paste the whole
|
||||
* thing so we can avoid introducing our own bugs (and easily pick up
|
||||
* their fixes when they happen) --Mark */
|
||||
static int walk_page_buffers( handle_t *handle,
|
||||
struct buffer_head *head,
|
||||
unsigned from,
|
||||
unsigned to,
|
||||
int *partial,
|
||||
int (*fn)( handle_t *handle,
|
||||
struct buffer_head *bh))
|
||||
{
|
||||
struct buffer_head *bh;
|
||||
unsigned block_start, block_end;
|
||||
unsigned blocksize = head->b_size;
|
||||
int err, ret = 0;
|
||||
struct buffer_head *next;
|
||||
|
||||
for ( bh = head, block_start = 0;
|
||||
ret == 0 && (bh != head || !block_start);
|
||||
block_start = block_end, bh = next)
|
||||
{
|
||||
next = bh->b_this_page;
|
||||
block_end = block_start + blocksize;
|
||||
if (block_end <= from || block_start >= to) {
|
||||
if (partial && !buffer_uptodate(bh))
|
||||
*partial = 1;
|
||||
continue;
|
||||
}
|
||||
err = (*fn)(handle, bh);
|
||||
if (!ret)
|
||||
ret = err;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
|
||||
struct page *page,
|
||||
unsigned from,
|
||||
unsigned to)
|
||||
{
|
||||
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
||||
struct ocfs2_journal_handle *handle = NULL;
|
||||
int ret = 0;
|
||||
|
||||
handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
|
||||
if (!handle) {
|
||||
ret = -ENOMEM;
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (ocfs2_should_order_data(inode)) {
|
||||
ret = walk_page_buffers(handle->k_handle,
|
||||
page_buffers(page),
|
||||
from, to, NULL,
|
||||
ocfs2_journal_dirty_data);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
}
|
||||
out:
|
||||
if (ret) {
|
||||
if (handle)
|
||||
ocfs2_commit_trans(handle);
|
||||
handle = ERR_PTR(ret);
|
||||
}
|
||||
return handle;
|
||||
}
|
||||
|
||||
static int ocfs2_commit_write(struct file *file, struct page *page,
|
||||
unsigned from, unsigned to)
|
||||
{
|
||||
int ret, extending = 0, locklevel = 0;
|
||||
loff_t new_i_size;
|
||||
struct buffer_head *di_bh = NULL;
|
||||
struct inode *inode = page->mapping->host;
|
||||
struct ocfs2_journal_handle *handle = NULL;
|
||||
|
||||
mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
|
||||
|
||||
/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
|
||||
* us to sample inode->i_size here without the metadata lock:
|
||||
*
|
||||
* 1) We're currently holding the inode alloc lock, so no
|
||||
* nodes can change it underneath us.
|
||||
*
|
||||
* 2) We've had to take the metadata lock at least once
|
||||
* already to check for extending writes, hence insuring
|
||||
* that our current copy is also up to date.
|
||||
*/
|
||||
new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
|
||||
if (new_i_size > i_size_read(inode)) {
|
||||
extending = 1;
|
||||
locklevel = 1;
|
||||
}
|
||||
|
||||
ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
|
||||
if (ret != 0) {
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = ocfs2_data_lock_with_page(inode, 1, page);
|
||||
if (ret != 0) {
|
||||
mlog_errno(ret);
|
||||
goto out_unlock_meta;
|
||||
}
|
||||
|
||||
if (extending) {
|
||||
handle = ocfs2_start_walk_page_trans(inode, page, from, to);
|
||||
if (IS_ERR(handle)) {
|
||||
ret = PTR_ERR(handle);
|
||||
handle = NULL;
|
||||
goto out_unlock_data;
|
||||
}
|
||||
|
||||
/* Mark our buffer early. We'd rather catch this error up here
|
||||
* as opposed to after a successful commit_write which would
|
||||
* require us to set back inode->i_size. */
|
||||
ret = ocfs2_journal_access(handle, inode, di_bh,
|
||||
OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
if (ret < 0) {
|
||||
mlog_errno(ret);
|
||||
goto out_commit;
|
||||
}
|
||||
}
|
||||
|
||||
/* might update i_size */
|
||||
ret = generic_commit_write(file, page, from, to);
|
||||
if (ret < 0) {
|
||||
mlog_errno(ret);
|
||||
goto out_commit;
|
||||
}
|
||||
|
||||
if (extending) {
|
||||
loff_t size = (u64) i_size_read(inode);
|
||||
struct ocfs2_dinode *di =
|
||||
(struct ocfs2_dinode *)di_bh->b_data;
|
||||
|
||||
/* ocfs2_mark_inode_dirty is too heavy to use here. */
|
||||
inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
|
||||
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
|
||||
|
||||
di->i_size = cpu_to_le64(size);
|
||||
di->i_ctime = di->i_mtime =
|
||||
cpu_to_le64(inode->i_mtime.tv_sec);
|
||||
di->i_ctime_nsec = di->i_mtime_nsec =
|
||||
cpu_to_le32(inode->i_mtime.tv_nsec);
|
||||
|
||||
ret = ocfs2_journal_dirty(handle, di_bh);
|
||||
if (ret < 0) {
|
||||
mlog_errno(ret);
|
||||
goto out_commit;
|
||||
}
|
||||
}
|
||||
|
||||
BUG_ON(extending && (i_size_read(inode) != new_i_size));
|
||||
|
||||
out_commit:
|
||||
if (handle)
|
||||
ocfs2_commit_trans(handle);
|
||||
out_unlock_data:
|
||||
ocfs2_data_unlock(inode, 1);
|
||||
out_unlock_meta:
|
||||
ocfs2_meta_unlock(inode, locklevel);
|
||||
out:
|
||||
if (di_bh)
|
||||
brelse(di_bh);
|
||||
|
||||
mlog_exit(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * ->bmap: translate logical @block of the mapping's host inode through
 * ocfs2_extent_map_get_blocks().  Returns the resulting block number, or 0
 * if locking or the lookup failed.
 */
static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	u64 p_blkno = 0;
	sector_t result;
	int err = 0;

	mlog_entry("(block = %llu)\n", (unsigned long long)block);

	/*
	 * We don't need to lock journal system files, since they aren't
	 * accessed concurrently from multiple nodes.
	 */
	if (!INODE_JOURNAL(inode)) {
		err = ocfs2_meta_lock(inode, NULL, NULL, 0);
		if (err) {
			if (err != -ENOENT)
				mlog_errno(err);
			goto bail;
		}
		down_read(&OCFS2_I(inode)->ip_alloc_sem);
	}

	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
					  NULL);

	if (!INODE_JOURNAL(inode)) {
		up_read(&OCFS2_I(inode)->ip_alloc_sem);
		ocfs2_meta_unlock(inode, 0);
	}

	if (err) {
		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
		     (unsigned long long)block);
		mlog_errno(err);
	}

bail:
	result = err ? 0 : p_blkno;

	mlog_exit((int)result);

	return result;
}
|
||||
|
||||
/*
|
||||
* TODO: Make this into a generic get_blocks function.
|
||||
*
|
||||
* From do_direct_io in direct-io.c:
|
||||
* "So what we do is to permit the ->get_blocks function to populate
|
||||
* bh.b_size with the size of IO which is permitted at this offset and
|
||||
* this i_blkbits."
|
||||
*
|
||||
* This function is called directly from get_more_blocks in direct-io.c.
|
||||
*
|
||||
* called like this: dio->get_blocks(dio->inode, fs_startblk,
|
||||
* fs_count, map_bh, dio->rw == WRITE);
|
||||
*/
|
||||
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
|
||||
unsigned long max_blocks,
|
||||
struct buffer_head *bh_result, int create)
|
||||
{
|
||||
int ret;
|
||||
u64 vbo_max; /* file offset, max_blocks from iblock */
|
||||
u64 p_blkno;
|
||||
int contig_blocks;
|
||||
unsigned char blocksize_bits;
|
||||
|
||||
if (!inode || !bh_result) {
|
||||
mlog(ML_ERROR, "inode or bh_result is null\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
blocksize_bits = inode->i_sb->s_blocksize_bits;
|
||||
|
||||
/* This function won't even be called if the request isn't all
|
||||
* nicely aligned and of the right size, so there's no need
|
||||
* for us to check any of that. */
|
||||
|
||||
vbo_max = ((u64)iblock + max_blocks) << blocksize_bits;
|
||||
|
||||
spin_lock(&OCFS2_I(inode)->ip_lock);
|
||||
if ((iblock + max_blocks) >
|
||||
ocfs2_clusters_to_blocks(inode->i_sb,
|
||||
OCFS2_I(inode)->ip_clusters)) {
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
ret = -EIO;
|
||||
goto bail;
|
||||
}
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
/* This figures out the size of the next contiguous block, and
|
||||
* our logical offset */
|
||||
ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
|
||||
&contig_blocks);
|
||||
if (ret) {
|
||||
mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
|
||||
(unsigned long long)iblock);
|
||||
ret = -EIO;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
map_bh(bh_result, inode->i_sb, p_blkno);
|
||||
|
||||
/* make sure we don't map more than max_blocks blocks here as
|
||||
that's all the kernel will handle at this point. */
|
||||
if (max_blocks < contig_blocks)
|
||||
contig_blocks = max_blocks;
|
||||
bh_result->b_size = contig_blocks << blocksize_bits;
|
||||
bail:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
|
||||
* particularly interested in the aio/dio case. Like the core uses
|
||||
* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
|
||||
* truncation on another.
|
||||
*/
|
||||
static void ocfs2_dio_end_io(struct kiocb *iocb,
|
||||
loff_t offset,
|
||||
ssize_t bytes,
|
||||
void *private)
|
||||
{
|
||||
struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
|
||||
|
||||
/* this io's submitter should not have unlocked this before we could */
|
||||
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
|
||||
ocfs2_iocb_clear_rw_locked(iocb);
|
||||
up_read(&inode->i_alloc_sem);
|
||||
ocfs2_rw_unlock(inode, 0);
|
||||
}
|
||||
|
||||
static ssize_t ocfs2_direct_IO(int rw,
|
||||
struct kiocb *iocb,
|
||||
const struct iovec *iov,
|
||||
loff_t offset,
|
||||
unsigned long nr_segs)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
|
||||
int ret;
|
||||
|
||||
mlog_entry_void();
|
||||
ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
|
||||
inode->i_sb->s_bdev, iov, offset,
|
||||
nr_segs,
|
||||
ocfs2_direct_IO_get_blocks,
|
||||
ocfs2_dio_end_io);
|
||||
mlog_exit(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct address_space_operations ocfs2_aops = {
|
||||
.readpage = ocfs2_readpage,
|
||||
.writepage = ocfs2_writepage,
|
||||
.prepare_write = ocfs2_prepare_write,
|
||||
.commit_write = ocfs2_commit_write,
|
||||
.bmap = ocfs2_bmap,
|
||||
.sync_page = block_sync_page,
|
||||
.direct_IO = ocfs2_direct_IO
|
||||
};
|
41
fs/ocfs2/aops.h
Normal file
41
fs/ocfs2/aops.h
Normal file
@ -0,0 +1,41 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
 * Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_AOPS_H
|
||||
#define OCFS2_AOPS_H
|
||||
|
||||
int ocfs2_prepare_write(struct file *file, struct page *page,
|
||||
unsigned from, unsigned to);
|
||||
|
||||
struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
|
||||
struct page *page,
|
||||
unsigned from,
|
||||
unsigned to);
|
||||
|
||||
/*
 * all ocfs2_dio_end_io()'s fault
 *
 * Bit 0 of iocb->private is used as a "rw lock held" marker.  The macro
 * argument is parenthesized so that any pointer expression can be passed.
 */
#define ocfs2_iocb_is_rw_locked(iocb) \
	test_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_set_rw_locked(iocb) \
	set_bit(0, (unsigned long *)&(iocb)->private)
#define ocfs2_iocb_clear_rw_locked(iocb) \
	clear_bit(0, (unsigned long *)&(iocb)->private)
|
||||
|
||||
#endif /* OCFS2_AOPS_H */
|
232
fs/ocfs2/buffer_head_io.c
Normal file
232
fs/ocfs2/buffer_head_io.c
Normal file
@ -0,0 +1,232 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
 * buffer_head_io.c
|
||||
*
|
||||
* Buffer cache handling
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
 * Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "inode.h"
|
||||
#include "journal.h"
|
||||
#include "uptodate.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
|
||||
struct inode *inode)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
|
||||
(unsigned long long)bh->b_blocknr, inode);
|
||||
|
||||
BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
|
||||
BUG_ON(buffer_jbd(bh));
|
||||
|
||||
/* No need to check for a soft readonly file system here. non
|
||||
* journalled writes are only ever done on system files which
|
||||
* can get modified during recovery even if read-only. */
|
||||
if (ocfs2_is_hard_readonly(osb)) {
|
||||
ret = -EROFS;
|
||||
goto out;
|
||||
}
|
||||
|
||||
down(&OCFS2_I(inode)->ip_io_sem);
|
||||
|
||||
lock_buffer(bh);
|
||||
set_buffer_uptodate(bh);
|
||||
|
||||
/* remove from dirty list before I/O. */
|
||||
clear_buffer_dirty(bh);
|
||||
|
||||
get_bh(bh); /* for end_buffer_write_sync() */
|
||||
bh->b_end_io = end_buffer_write_sync;
|
||||
submit_bh(WRITE, bh);
|
||||
|
||||
wait_on_buffer(bh);
|
||||
|
||||
if (buffer_uptodate(bh)) {
|
||||
ocfs2_set_buffer_uptodate(inode, bh);
|
||||
} else {
|
||||
/* We don't need to remove the clustered uptodate
|
||||
* information for this bh as it's not marked locally
|
||||
* uptodate. */
|
||||
ret = -EIO;
|
||||
brelse(bh);
|
||||
}
|
||||
|
||||
up(&OCFS2_I(inode)->ip_io_sem);
|
||||
out:
|
||||
mlog_exit(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Read @nr consecutive blocks starting at @block into @bhs[].
 *
 * NULL slots in @bhs are filled with sb_getblk().  With OCFS2_BH_CACHED set
 * (only honoured when @inode is given) a buffer that
 * ocfs2_buffer_uptodate() accepts is not re-read.  Journal managed buffers
 * and dirty buffers are never submitted.  OCFS2_BH_READAHEAD selects READA
 * instead of READ.
 *
 * When @inode is non-NULL its ip_io_sem is held across both loops.
 *
 * Returns 0 on success, -EINVAL for bad arguments, and -EIO if a block
 * could not be obtained or is not uptodate afterwards; each such failed
 * buffer is released and its slot set to NULL.
 */
int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
		      struct buffer_head *bhs[], int flags,
		      struct inode *inode)
{
	struct super_block *sb;
	struct buffer_head *bh;
	int ignore_cache = 0;
	int status = 0;
	int i;

	mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n",
		   block, nr, flags, inode);

	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	if (nr < 0) {
		mlog(ML_ERROR, "asked to read %d blocks!\n", nr);
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	if (nr == 0) {
		mlog(ML_BH_IO, "No buffers will be read!\n");
		status = 0;
		goto bail;
	}

	sb = osb->sb;

	/* without an inode the cached flag cannot be honoured */
	if (!inode)
		flags &= ~OCFS2_BH_CACHED;

	if (inode)
		down(&OCFS2_I(inode)->ip_io_sem);

	/* pass 1: get every buffer and submit reads where needed */
	for (i = 0; i < nr; i++) {
		int must_read;

		if (bhs[i] == NULL) {
			bhs[i] = sb_getblk(sb, block++);
			if (bhs[i] == NULL) {
				if (inode)
					up(&OCFS2_I(inode)->ip_io_sem);
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
		}
		bh = bhs[i];

		ignore_cache = 0;
		if ((flags & OCFS2_BH_CACHED) &&
		    !ocfs2_buffer_uptodate(inode, bh)) {
			mlog(ML_UPTODATE,
			     "bh (%llu), inode %"MLFu64" not uptodate\n",
			     (unsigned long long)bh->b_blocknr,
			     OCFS2_I(inode)->ip_blkno);
			ignore_cache = 1;
		}
		must_read = !(flags & OCFS2_BH_CACHED) || ignore_cache;

		/* XXX: Can we ever get this and *not* have the cached
		 * flag set? */
		if (buffer_jbd(bh)) {
			if (must_read) {
				mlog(ML_BH_IO, "trying to sync read a jbd "
					       "managed bh (blocknr = %llu)\n",
				     (unsigned long long)bh->b_blocknr);
			}
			continue;
		}

		if (!must_read)
			continue;

		if (buffer_dirty(bh)) {
			/* This should probably be a BUG, or
			 * at least return an error. */
			mlog(ML_BH_IO, "asking me to sync read a dirty "
				       "buffer! (blocknr = %llu)\n",
			     (unsigned long long)bh->b_blocknr);
			continue;
		}

		lock_buffer(bh);
		if (buffer_jbd(bh)) {
#ifdef CATCH_BH_JBD_RACES
			mlog(ML_ERROR, "block %llu had the JBD bit set "
				       "while I was in lock_buffer!",
			     (unsigned long long)bh->b_blocknr);
			BUG();
#else
			unlock_buffer(bh);
			continue;
#endif
		}
		clear_buffer_uptodate(bh);
		get_bh(bh); /* for end_buffer_read_sync() */
		bh->b_end_io = end_buffer_read_sync;
		if (flags & OCFS2_BH_READAHEAD)
			submit_bh(READA, bh);
		else
			submit_bh(READ, bh);
	}

	status = 0;

	/* pass 2: wait for the reads, walking the array backwards */
	for (i = nr - 1; i >= 0; i--) {
		bh = bhs[i];

		/* We know this can't have changed as we hold the
		 * inode sem. Avoid doing any work on the bh if the
		 * journal has it. */
		if (!buffer_jbd(bh))
			wait_on_buffer(bh);

		if (!buffer_uptodate(bh)) {
			/* Status won't be cleared from here on out,
			 * so we can safely record this and loop back
			 * to cleanup the other buffers. Don't need to
			 * remove the clustered uptodate information
			 * for this bh as it's not marked locally
			 * uptodate. */
			status = -EIO;
			brelse(bh);
			bhs[i] = NULL;
			continue;
		}

		if (inode)
			ocfs2_set_buffer_uptodate(inode, bh);
	}
	if (inode)
		up(&OCFS2_I(inode)->ip_io_sem);

	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");

bail:

	mlog_exit(status);
	return status;
}
|
73
fs/ocfs2/buffer_head_io.h
Normal file
73
fs/ocfs2/buffer_head_io.h
Normal file
@ -0,0 +1,73 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* buffer_head_io.h
|
||||
*
|
||||
* Buffer cache handling functions defined
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_BUFFER_HEAD_IO_H
|
||||
#define OCFS2_BUFFER_HEAD_IO_H
|
||||
|
||||
#include <linux/buffer_head.h>
|
||||
|
||||
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
|
||||
int uptodate);
|
||||
|
||||
static inline int ocfs2_read_block(struct ocfs2_super *osb,
|
||||
u64 off,
|
||||
struct buffer_head **bh,
|
||||
int flags,
|
||||
struct inode *inode);
|
||||
|
||||
int ocfs2_write_block(struct ocfs2_super *osb,
|
||||
struct buffer_head *bh,
|
||||
struct inode *inode);
|
||||
int ocfs2_read_blocks(struct ocfs2_super *osb,
|
||||
u64 block,
|
||||
int nr,
|
||||
struct buffer_head *bhs[],
|
||||
int flags,
|
||||
struct inode *inode);
|
||||
|
||||
|
||||
/* Bits for the 'flags' argument of ocfs2_read_block() / ocfs2_read_blocks(). */
#define OCFS2_BH_CACHED		1	/* skip the read if the buffer is already uptodate for the inode */
#define OCFS2_BH_READAHEAD	8	/* use this to pass READA down to submit_bh */
|
||||
|
||||
/*
 * Read the single block 'off' by handing a one-element request to
 * ocfs2_read_blocks().
 *
 * 'bh' is the address of the caller's buffer_head pointer and must not be
 * NULL; 'flags' and 'inode' are passed through unchanged.
 *
 * Returns -EINVAL when 'bh' is NULL, otherwise whatever
 * ocfs2_read_blocks() returns.
 */
static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
				   struct buffer_head **bh, int flags,
				   struct inode *inode)
{
	if (!bh) {
		printk("ocfs2: bh == NULL\n");
		return -EINVAL;
	}

	return ocfs2_read_blocks(osb, off, 1, bh, flags, inode);
}
|
||||
|
||||
#endif /* OCFS2_BUFFER_HEAD_IO_H */
|
4
fs/ocfs2/cluster/Makefile
Normal file
4
fs/ocfs2/cluster/Makefile
Normal file
@ -0,0 +1,4 @@
|
||||
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
|
||||
|
||||
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
|
||||
quorum.o tcp.o ver.o
|
30
fs/ocfs2/cluster/endian.h
Normal file
30
fs/ocfs2/cluster/endian.h
Normal file
@ -0,0 +1,30 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_CLUSTER_ENDIAN_H
|
||||
#define OCFS2_CLUSTER_ENDIAN_H
|
||||
|
||||
/*
 * Add the CPU-order value 'val' to the big-endian 32-bit quantity at
 * '*var', storing the result back in big-endian form.
 */
static inline void be32_add_cpu(__be32 *var, u32 val)
{
	u32 sum = be32_to_cpu(*var) + val;

	*var = cpu_to_be32(sum);
}
|
||||
|
||||
#endif /* OCFS2_CLUSTER_ENDIAN_H */
|
1797
fs/ocfs2/cluster/heartbeat.c
Normal file
1797
fs/ocfs2/cluster/heartbeat.c
Normal file
File diff suppressed because it is too large
Load Diff
82
fs/ocfs2/cluster/heartbeat.h
Normal file
82
fs/ocfs2/cluster/heartbeat.h
Normal file
@ -0,0 +1,82 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* heartbeat.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_HEARTBEAT_H
|
||||
#define O2CLUSTER_HEARTBEAT_H
|
||||
|
||||
#include "ocfs2_heartbeat.h"
|
||||
|
||||
#define O2HB_REGION_TIMEOUT_MS		2000

/* number of changes to be seen as live */
#define O2HB_LIVE_THRESHOLD		2

/* number of equal samples to be seen as dead */
extern unsigned int o2hb_dead_threshold;
#define O2HB_DEFAULT_DEAD_THRESHOLD	7

/* Otherwise MAX_WRITE_TIMEOUT will be zero... */
#define O2HB_MIN_DEAD_THRESHOLD		2

/* One region timeout for every dead-threshold sample except the last. */
#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1))

#define O2HB_CB_MAGIC			0x51d1e4ec
|
||||
|
||||
/*
 * callback stuff
 *
 * Kinds of heartbeat event a callback can be registered for.
 * O2HB_NUM_CB is the number of kinds, not an event itself.
 */
enum o2hb_callback_type {
	O2HB_NODE_DOWN_CB = 0,
	O2HB_NODE_UP_CB,
	O2HB_NUM_CB
};
|
||||
|
||||
struct o2nm_node;
|
||||
typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *);
|
||||
|
||||
struct o2hb_callback_func {
|
||||
u32 hc_magic;
|
||||
struct list_head hc_item;
|
||||
o2hb_cb_func *hc_func;
|
||||
void *hc_data;
|
||||
int hc_priority;
|
||||
enum o2hb_callback_type hc_type;
|
||||
};
|
||||
|
||||
struct config_group *o2hb_alloc_hb_set(void);
|
||||
void o2hb_free_hb_set(struct config_group *group);
|
||||
|
||||
void o2hb_setup_callback(struct o2hb_callback_func *hc,
|
||||
enum o2hb_callback_type type,
|
||||
o2hb_cb_func *func,
|
||||
void *data,
|
||||
int priority);
|
||||
int o2hb_register_callback(struct o2hb_callback_func *hc);
|
||||
int o2hb_unregister_callback(struct o2hb_callback_func *hc);
|
||||
void o2hb_fill_node_map(unsigned long *map,
|
||||
unsigned bytes);
|
||||
void o2hb_init(void);
|
||||
int o2hb_check_node_heartbeating(u8 node_num);
|
||||
int o2hb_check_node_heartbeating_from_callback(u8 node_num);
|
||||
int o2hb_check_local_node_heartbeating(void);
|
||||
void o2hb_stop_all_regions(void);
|
||||
|
||||
#endif /* O2CLUSTER_HEARTBEAT_H */
|
166
fs/ocfs2/cluster/masklog.c
Normal file
166
fs/ocfs2/cluster/masklog.c
Normal file
@ -0,0 +1,166 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/string.h>
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
#include "masklog.h"
|
||||
|
||||
struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
|
||||
EXPORT_SYMBOL_GPL(mlog_and_bits);
|
||||
struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
|
||||
EXPORT_SYMBOL_GPL(mlog_not_bits);
|
||||
|
||||
/*
 * Format the state of 'mask' into 'buf' as "allow", "deny" or "off"
 * followed by a newline.  "allow" wins if the mask is present in both
 * global masks.  Returns the snprintf() result.
 */
static ssize_t mlog_mask_show(u64 mask, char *buf)
{
	char *state = "off";

	if (__mlog_test_u64(mask, mlog_and_bits))
		state = "allow";
	else if (__mlog_test_u64(mask, mlog_not_bits))
		state = "deny";

	return snprintf(buf, PAGE_SIZE, "%s\n", state);
}
|
||||
|
||||
/*
 * Apply the state named at the start of 'buf' to 'mask'.  The keyword is
 * matched case-insensitively as a prefix:
 *
 *	"allow"	set in mlog_and_bits, cleared from mlog_not_bits
 *	"deny"	set in mlog_not_bits, cleared from mlog_and_bits
 *	"off"	cleared from both
 *
 * Returns 'count' on success or -EINVAL for any other input.
 */
static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
{
	if (strnicmp(buf, "allow", 5) == 0) {
		__mlog_set_u64(mask, mlog_and_bits);
		__mlog_clear_u64(mask, mlog_not_bits);
		return count;
	}

	if (strnicmp(buf, "deny", 4) == 0) {
		__mlog_set_u64(mask, mlog_not_bits);
		__mlog_clear_u64(mask, mlog_and_bits);
		return count;
	}

	if (strnicmp(buf, "off", 3) == 0) {
		__mlog_clear_u64(mask, mlog_not_bits);
		__mlog_clear_u64(mask, mlog_and_bits);
		return count;
	}

	return -EINVAL;
}
|
||||
|
||||
struct mlog_attribute {
|
||||
struct attribute attr;
|
||||
u64 mask;
|
||||
};
|
||||
|
||||
#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr)
|
||||
|
||||
/*
 * Initializer for one mlog_attrs[] entry: the attribute is named after the
 * bit (e.g. "ENTRY"), is world-readable and owner-writable, and carries
 * the matching ML_<name> mask.
 */
#define define_mask(_name) {			\
	.attr = {				\
		.name = #_name,			\
		.mode = S_IRUGO | S_IWUSR,	\
	},					\
	.mask = ML_##_name,			\
}
|
||||
|
||||
static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
|
||||
define_mask(ENTRY),
|
||||
define_mask(EXIT),
|
||||
define_mask(TCP),
|
||||
define_mask(MSG),
|
||||
define_mask(SOCKET),
|
||||
define_mask(HEARTBEAT),
|
||||
define_mask(HB_BIO),
|
||||
define_mask(DLMFS),
|
||||
define_mask(DLM),
|
||||
define_mask(DLM_DOMAIN),
|
||||
define_mask(DLM_THREAD),
|
||||
define_mask(DLM_MASTER),
|
||||
define_mask(DLM_RECOVERY),
|
||||
define_mask(AIO),
|
||||
define_mask(JOURNAL),
|
||||
define_mask(DISK_ALLOC),
|
||||
define_mask(SUPER),
|
||||
define_mask(FILE_IO),
|
||||
define_mask(EXTENT_MAP),
|
||||
define_mask(DLM_GLUE),
|
||||
define_mask(BH_IO),
|
||||
define_mask(UPTODATE),
|
||||
define_mask(NAMEI),
|
||||
define_mask(INODE),
|
||||
define_mask(VOTE),
|
||||
define_mask(DCACHE),
|
||||
define_mask(CONN),
|
||||
define_mask(QUORUM),
|
||||
define_mask(EXPORT),
|
||||
define_mask(ERROR),
|
||||
define_mask(NOTICE),
|
||||
define_mask(KTHREAD),
|
||||
};
|
||||
|
||||
/* NULL-terminated attribute list used as mlog_ktype.default_attrs; filled from mlog_attrs[] by mlog_sys_init(). */
static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, };
|
||||
|
||||
static ssize_t mlog_show(struct kobject *obj, struct attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct mlog_attribute *mlog_attr = to_mlog_attr(attr);
|
||||
|
||||
return mlog_mask_show(mlog_attr->mask, buf);
|
||||
}
|
||||
|
||||
/* sysfs store hook: change the state of the bit behind 'attr'. */
static ssize_t mlog_store(struct kobject *obj, struct attribute *attr,
			  const char *buf, size_t count)
{
	return mlog_mask_store(to_mlog_attr(attr)->mask, buf, count);
}
|
||||
|
||||
static struct sysfs_ops mlog_attr_ops = {
|
||||
.show = mlog_show,
|
||||
.store = mlog_store,
|
||||
};
|
||||
|
||||
static struct kobj_type mlog_ktype = {
|
||||
.default_attrs = mlog_attr_ptrs,
|
||||
.sysfs_ops = &mlog_attr_ops,
|
||||
};
|
||||
|
||||
static struct kset mlog_kset = {
|
||||
.kobj = {.name = "logmask", .ktype = &mlog_ktype},
|
||||
};
|
||||
|
||||
int mlog_sys_init(struct subsystem *o2cb_subsys)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
while (mlog_attrs[i].attr.mode) {
|
||||
mlog_attr_ptrs[i] = &mlog_attrs[i].attr;
|
||||
i++;
|
||||
}
|
||||
mlog_attr_ptrs[i] = NULL;
|
||||
|
||||
mlog_kset.subsys = o2cb_subsys;
|
||||
return kset_register(&mlog_kset);
|
||||
}
|
||||
|
||||
void mlog_sys_shutdown(void)
|
||||
{
|
||||
kset_unregister(&mlog_kset);
|
||||
}
|
275
fs/ocfs2/cluster/masklog.h
Normal file
275
fs/ocfs2/cluster/masklog.h
Normal file
@ -0,0 +1,275 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_MASKLOG_H
|
||||
#define O2CLUSTER_MASKLOG_H
|
||||
|
||||
/*
 * For now this is a trivial wrapper around printk() that gives the critical
 * ability to enable sets of debugging output at run-time.  In the future this
 * will almost certainly be redirected to relayfs so that it can pay a
 * substantially lower heisenberg tax.
 *
 * Callers associate the message with a bitmask, and two global bitmasks
 * (mlog_and_bits and mlog_not_bits) are maintained with help from sysfs.  A
 * message is output if any of its bits match the "and" mask and none of them
 * match the "not" mask.
 *
 * We must have efficient bit tests on i386 and it seems gcc still emits crazy
 * code for the 64bit compare.  It emits very good code for the dual unsigned
 * long tests, though, completely avoiding tests that can never pass if the
 * caller gives a constant bitmask that fills one of the longs with all 0s.  So
 * the desire is to have almost all of the calls decided on by comparing just
 * one of the longs.  This leads to having infrequently given bits that are
 * frequently matched in the high bits.
 *
 * _ERROR and _NOTICE are used for messages that always go to the console and
 * have appropriate KERN_ prefixes.  We wrap these in our function instead of
 * just calling printk() so that this can eventually make its way through
 * relayfs along with the debugging messages.  Everything else gets KERN_INFO.
 * The inline tests and macro dance give GCC the opportunity to quite cleverly
 * only emit the appropriate printk() when the caller passes in a constant
 * mask, as is almost always the case.
 *
 * All this bitmask nonsense is hidden behind the sysfs interface so that Joel
 * doesn't have an aneurism.  mlog_sys_init() registers a "logmask" kset with
 * one attribute file per mask bit (ENTRY, EXIT, TCP, ...).  Reading a file
 * gives a straight forward indication of the state of that bit:
 *	allow	the bit is set in the "and" mask
 *	deny	the bit is set in the "not" mask
 *	off	the bit is set in neither mask
 *
 * Writing changes the state of the bit and requires a single write() call
 * whose data starts with one of those three words (case is ignored):
 *
 *	write(fd, "allow", 5);
 *
 * on the ENTRY file would allow entry messages.
 *
 * Some trivial shell can flip all the bits at once:
 *
 *	# $logmask_dir is wherever sysfs exposes the "logmask" kset
 *	for bit in $(ls "$logmask_dir"); do
 *		# $1 is "allow", "deny" or "off"
 *		echo "$1" > "$logmask_dir/$bit"
 *	done
 */
|
||||
|
||||
/* for task_struct */
|
||||
#include <linux/sched.h>
|
||||
|
||||
/* bits that are frequently given and infrequently matched in the low word */
/* NOTE: If you add a flag, you need to also update mlog_attrs[] in masklog.c! */
#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
#define ML_EXIT		0x0000000000000002ULL /* func call exit */
#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
#define ML_MSG		0x0000000000000008ULL /* net network messages */
#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm recovery functions */
#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging */
#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
#define ML_CONN		0x0000000004000000ULL /* net connection management */
#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE	0x0000000200000000ULL /* sent to KERN_NOTICE */
#define ML_KTHREAD	0x0000000400000000ULL /* kernel thread activity */
|
||||
|
||||
/* Start-up contents of mlog_and_bits and mlog_not_bits respectively. */
#define MLOG_INITIAL_AND_MASK	(ML_ERROR|ML_NOTICE)
#define MLOG_INITIAL_NOT_MASK	(ML_ENTRY|ML_EXIT)

/* Bits OR-ed into every mask passed to mlog(); a file may predefine it. */
#ifndef MLOG_MASK_PREFIX
#define MLOG_MASK_PREFIX	0
#endif

#define MLOG_MAX_BITS		64
|
||||
|
||||
struct mlog_bits {
|
||||
unsigned long words[MLOG_MAX_BITS / BITS_PER_LONG];
|
||||
};
|
||||
|
||||
extern struct mlog_bits mlog_and_bits, mlog_not_bits;
|
||||
|
||||
/*
 * Helpers that test, set and clear a u64 mask against a struct mlog_bits,
 * plus MLOG_BITS_RHS() which builds a static initializer from a mask.
 * With 32bit longs the mask is split across words[0] (low half) and
 * words[1] (high half); with 64bit longs it lives entirely in words[0].
 *
 * Both macro arguments are fully parenthesized so that any expression may
 * be passed, e.g. a dereferenced pointer for 'bits'.
 */
#if BITS_PER_LONG == 32

#define __mlog_test_u64(mask, bits)					\
	( ((u32)((mask) & 0xffffffff) & (bits).words[0]) ||		\
	  (((u64)(mask) >> 32) & (bits).words[1]) )
#define __mlog_set_u64(mask, bits) do {					\
	(bits).words[0] |= (u32)((mask) & 0xffffffff);			\
	(bits).words[1] |= (u64)(mask) >> 32;				\
} while (0)
#define __mlog_clear_u64(mask, bits) do {				\
	(bits).words[0] &= ~((u32)((mask) & 0xffffffff));		\
	(bits).words[1] &= ~((u64)(mask) >> 32);			\
} while (0)
#define MLOG_BITS_RHS(mask) {						\
	{								\
		[0] = (u32)((mask) & 0xffffffff),			\
		[1] = (u64)(mask) >> 32,				\
	}								\
}

#else /* 32bit long above, 64bit long below */

#define __mlog_test_u64(mask, bits)	((mask) & (bits).words[0])
#define __mlog_set_u64(mask, bits) do {					\
	(bits).words[0] |= (mask);					\
} while (0)
#define __mlog_clear_u64(mask, bits) do {				\
	(bits).words[0] &= ~(mask);					\
} while (0)
#define MLOG_BITS_RHS(mask) { { (mask) } }

#endif
|
||||
|
||||
/*
 * smp_processor_id() "helpfully" screams when called outside preemptible
 * regions in current kernels.  sles doesn't have the variants that don't
 * scream.  just do this instead of trying to guess which we're building
 * against.. *sigh*.
 *
 * The value is only a guess: the get_cpu()/put_cpu() pair is closed
 * before the number is used.
 */
#define __mlog_cpu_guess ({		\
	unsigned long _cpu = get_cpu();	\
	put_cpu();			\
	_cpu;				\
})
|
||||
|
||||
/*
 * In this macro and in mlog() below, the whitespace around the ',' that
 * precedes ##args is intentional.  Otherwise, gcc 2.95 will eat the
 * previous token if args expands to nothing.
 *
 * Every message is prefixed with "(pid,cpu):function:line ".
 */
#define __mlog_printk(level, fmt, args...)				\
	printk(level "(%u,%lu):%s:%d " fmt, current->pid,		\
	       __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ ,	\
	       ##args)
|
||||
|
||||
/*
 * Print a message tagged with 'mask' (OR-ed with MLOG_MASK_PREFIX) if a
 * bit of it is set in mlog_and_bits and none is set in mlog_not_bits.
 * ML_ERROR messages go out at KERN_ERR with an "ERROR: " prefix,
 * ML_NOTICE messages at KERN_NOTICE and everything else at KERN_INFO.
 */
#define mlog(mask, fmt, args...) do {					\
	u64 __m = MLOG_MASK_PREFIX | (mask);				\
	if (__mlog_test_u64(__m, mlog_and_bits) &&			\
	    !__mlog_test_u64(__m, mlog_not_bits)) {			\
		if (__m & ML_ERROR)					\
			__mlog_printk(KERN_ERR, "ERROR: "fmt , ##args);	\
		else if (__m & ML_NOTICE)				\
			__mlog_printk(KERN_NOTICE, fmt , ##args);	\
		else							\
			__mlog_printk(KERN_INFO, fmt , ##args);		\
	}								\
} while (0)
|
||||
|
||||
/*
 * Log a status code as an ML_ERROR message.  -ERESTARTSYS, -EINTR and
 * AOP_TRUNCATED_PAGE are not reported.  'st' is evaluated once.
 */
#define mlog_errno(st) do {						\
	int _st = (st);							\
	if (_st != -ERESTARTSYS && _st != -EINTR &&			\
	    _st != AOP_TRUNCATED_PAGE)					\
		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
} while (0)
|
||||
|
||||
/* Function-entry trace (ML_ENTRY) with a caller-supplied argument format. */
#define mlog_entry(fmt, args...) do {					\
	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
} while (0)

/* Function-entry trace for functions with nothing to print. */
#define mlog_entry_void() do {						\
	mlog(ML_ENTRY, "ENTRY:\n");					\
} while (0)
|
||||
|
||||
/*
 * Function-exit trace (ML_EXIT) that prints the return value 'st'.
 *
 * With gcc >= 3.1 (and not under sparse) the format is chosen from the
 * static type of 'st' using __builtin_types_compatible_p; types that match
 * none of the listed cases are printed as unsigned long long.  We disable
 * this for old compilers since they don't have support for
 * __builtin_types_compatible_p, and always print as long long there.
 */
#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) && \
    !defined(__CHECKER__)
#define mlog_exit(st) do {						     \
	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
	else if (__builtin_types_compatible_p(typeof(st), signed long))      \
		mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st));	     \
	else if (__builtin_types_compatible_p(typeof(st), unsigned int)	     \
		 || __builtin_types_compatible_p(typeof(st), unsigned short) \
		 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
		mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st));	     \
	else if (__builtin_types_compatible_p(typeof(st), signed int)	     \
		 || __builtin_types_compatible_p(typeof(st), signed short)   \
		 || __builtin_types_compatible_p(typeof(st), signed char))   \
		mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st));		     \
	else if (__builtin_types_compatible_p(typeof(st), long long))	     \
		mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));	     \
	else								     \
		mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st));    \
} while (0)
#else
#define mlog_exit(st) do {						     \
	mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));		     \
} while (0)
#endif
|
||||
|
||||
/* Function-exit trace for functions returning a pointer. */
#define mlog_exit_ptr(ptr) do {						\
	mlog(ML_EXIT, "EXIT: %p\n", ptr);				\
} while (0)

/* Function-exit trace for functions returning nothing. */
#define mlog_exit_void() do {						\
	mlog(ML_EXIT, "EXIT\n");					\
} while (0)
|
||||
|
||||
/*
 * If 'cond' is true, log the failed expression and the caller's message
 * as ML_ERROR and then BUG().
 *
 * The whitespace before the ',' in front of ##args matches the other
 * variadic macros in this file; gcc 2.95 needs it when args is empty.
 */
#define mlog_bug_on_msg(cond, fmt, args...) do {			\
	if (cond) {							\
		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
		mlog(ML_ERROR, fmt , ##args);				\
		BUG();							\
	}								\
} while (0)
|
||||
|
||||
/*
 * printk conversion suffixes for 64bit values: the "ll" forms with 32bit
 * longs or on x86_64, the "l" forms otherwise.
 */
#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
#define MLFi64	"lld"
#define MLFu64	"llu"
#define MLFx64	"llx"
#else
#define MLFi64	"ld"
#define MLFu64	"lu"
#define MLFx64	"lx"
#endif
|
||||
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/sysfs.h>
|
||||
/* Register / unregister the "logmask" sysfs kset (see masklog.c). */
int mlog_sys_init(struct subsystem *o2cb_subsys);
void mlog_sys_shutdown(void);
|
||||
|
||||
#endif /* O2CLUSTER_MASKLOG_H */
|
791
fs/ocfs2/cluster/nodemanager.c
Normal file
791
fs/ocfs2/cluster/nodemanager.c
Normal file
@ -0,0 +1,791 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2004, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/configfs.h>
|
||||
|
||||
#include "endian.h"
|
||||
#include "tcp.h"
|
||||
#include "nodemanager.h"
|
||||
#include "heartbeat.h"
|
||||
#include "masklog.h"
|
||||
#include "sys.h"
|
||||
#include "ver.h"
|
||||
|
||||
/*
 * For now we operate under the assertion that there can be only one
 * cluster active at a time.  Changing this will require trickling
 * cluster references throughout where nodes are looked up.
 */
static struct o2nm_cluster *o2nm_single_cluster;
|
||||
|
||||
/* Buffer behind the "hb_ctl_path" sysctl, with its built-in default. */
#define OCFS2_MAX_HB_CTL_PATH 256
static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
|
||||
|
||||
static ctl_table ocfs2_nm_table[] = {
|
||||
{
|
||||
.ctl_name = 1,
|
||||
.procname = "hb_ctl_path",
|
||||
.data = ocfs2_hb_ctl_path,
|
||||
.maxlen = OCFS2_MAX_HB_CTL_PATH,
|
||||
.mode = 0644,
|
||||
.proc_handler = &proc_dostring,
|
||||
.strategy = &sysctl_string,
|
||||
},
|
||||
{ .ctl_name = 0 }
|
||||
};
|
||||
|
||||
static ctl_table ocfs2_mod_table[] = {
|
||||
{
|
||||
.ctl_name = KERN_OCFS2_NM,
|
||||
.procname = "nm",
|
||||
.data = NULL,
|
||||
.maxlen = 0,
|
||||
.mode = 0555,
|
||||
.child = ocfs2_nm_table
|
||||
},
|
||||
{ .ctl_name = 0}
|
||||
};
|
||||
|
||||
static ctl_table ocfs2_kern_table[] = {
|
||||
{
|
||||
.ctl_name = KERN_OCFS2,
|
||||
.procname = "ocfs2",
|
||||
.data = NULL,
|
||||
.maxlen = 0,
|
||||
.mode = 0555,
|
||||
.child = ocfs2_mod_table
|
||||
},
|
||||
{ .ctl_name = 0}
|
||||
};
|
||||
|
||||
static ctl_table ocfs2_root_table[] = {
|
||||
{
|
||||
.ctl_name = CTL_FS,
|
||||
.procname = "fs",
|
||||
.data = NULL,
|
||||
.maxlen = 0,
|
||||
.mode = 0555,
|
||||
.child = ocfs2_kern_table
|
||||
},
|
||||
{ .ctl_name = 0 }
|
||||
};
|
||||
|
||||
static struct ctl_table_header *ocfs2_table_header = NULL;
|
||||
|
||||
const char *o2nm_get_hb_ctl_path(void)
|
||||
{
|
||||
return ocfs2_hb_ctl_path;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
|
||||
|
||||
struct o2nm_cluster {
|
||||
struct config_group cl_group;
|
||||
unsigned cl_has_local:1;
|
||||
u8 cl_local_node;
|
||||
rwlock_t cl_nodes_lock;
|
||||
struct o2nm_node *cl_nodes[O2NM_MAX_NODES];
|
||||
struct rb_root cl_node_ip_tree;
|
||||
/* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
|
||||
unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
};
|
||||
|
||||
/*
 * Look up a node by its number.
 *
 * Returns the node with a config_item reference taken on it, or NULL if
 * the number is out of range, no cluster exists, or the slot is empty.
 */
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
{
	struct o2nm_node *node;

	if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL)
		return NULL;

	read_lock(&o2nm_single_cluster->cl_nodes_lock);
	node = o2nm_single_cluster->cl_nodes[node_num];
	if (node != NULL)
		config_item_get(&node->nd_item);
	read_unlock(&o2nm_single_cluster->cl_nodes_lock);

	return node;
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_get_node_by_num);
|
||||
|
||||
int o2nm_configured_node_map(unsigned long *map, unsigned bytes)
|
||||
{
|
||||
struct o2nm_cluster *cluster = o2nm_single_cluster;
|
||||
|
||||
BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap)));
|
||||
|
||||
if (cluster == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
read_lock(&cluster->cl_nodes_lock);
|
||||
memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap));
|
||||
read_unlock(&cluster->cl_nodes_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_configured_node_map);
|
||||
|
||||
static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
|
||||
__be32 ip_needle,
|
||||
struct rb_node ***ret_p,
|
||||
struct rb_node **ret_parent)
|
||||
{
|
||||
struct rb_node **p = &cluster->cl_node_ip_tree.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct o2nm_node *node, *ret = NULL;
|
||||
|
||||
while (*p) {
|
||||
parent = *p;
|
||||
node = rb_entry(parent, struct o2nm_node, nd_ip_node);
|
||||
|
||||
if (memcmp(&ip_needle, &node->nd_ipv4_address,
|
||||
sizeof(ip_needle)) < 0)
|
||||
p = &(*p)->rb_left;
|
||||
else if (memcmp(&ip_needle, &node->nd_ipv4_address,
|
||||
sizeof(ip_needle)) > 0)
|
||||
p = &(*p)->rb_right;
|
||||
else {
|
||||
ret = node;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret_p != NULL)
|
||||
*ret_p = p;
|
||||
if (ret_parent != NULL)
|
||||
*ret_parent = parent;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Look up a node by its IPv4 address.
 *
 * Returns the node with a config_item reference taken on it, or NULL if
 * no cluster exists or no node has that address.
 */
struct o2nm_node *o2nm_get_node_by_ip(__be32 addr)
{
	struct o2nm_cluster *cluster = o2nm_single_cluster;
	struct o2nm_node *node;

	if (!cluster)
		return NULL;

	read_lock(&cluster->cl_nodes_lock);
	node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL);
	if (node != NULL)
		config_item_get(&node->nd_item);
	read_unlock(&cluster->cl_nodes_lock);

	return node;
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip);
|
||||
|
||||
void o2nm_node_put(struct o2nm_node *node)
|
||||
{
|
||||
config_item_put(&node->nd_item);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_node_put);
|
||||
|
||||
void o2nm_node_get(struct o2nm_node *node)
|
||||
{
|
||||
config_item_get(&node->nd_item);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_node_get);
|
||||
|
||||
u8 o2nm_this_node(void)
|
||||
{
|
||||
u8 node_num = O2NM_MAX_NODES;
|
||||
|
||||
if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local)
|
||||
node_num = o2nm_single_cluster->cl_local_node;
|
||||
|
||||
return node_num;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(o2nm_this_node);
|
||||
|
||||
/* node configfs bits */
|
||||
|
||||
static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item)
|
||||
{
|
||||
return item ?
|
||||
container_of(to_config_group(item), struct o2nm_cluster,
|
||||
cl_group)
|
||||
: NULL;
|
||||
}
|
||||
|
||||
static struct o2nm_node *to_o2nm_node(struct config_item *item)
|
||||
{
|
||||
return item ? container_of(item, struct o2nm_node, nd_item) : NULL;
|
||||
}
|
||||
|
||||
static void o2nm_node_release(struct config_item *item)
|
||||
{
|
||||
struct o2nm_node *node = to_o2nm_node(item);
|
||||
kfree(node);
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", node->nd_num);
|
||||
}
|
||||
|
||||
static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node)
|
||||
{
|
||||
/* through the first node_set .parent
|
||||
* mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */
|
||||
return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent);
|
||||
}
|
||||
|
||||
enum {
|
||||
O2NM_NODE_ATTR_NUM = 0,
|
||||
O2NM_NODE_ATTR_PORT,
|
||||
O2NM_NODE_ATTR_ADDRESS,
|
||||
O2NM_NODE_ATTR_LOCAL,
|
||||
};
|
||||
|
||||
static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
|
||||
unsigned long tmp;
|
||||
char *p = (char *)page;
|
||||
|
||||
tmp = simple_strtoul(p, &p, 0);
|
||||
if (!p || (*p && (*p != '\n')))
|
||||
return -EINVAL;
|
||||
|
||||
if (tmp >= O2NM_MAX_NODES)
|
||||
return -ERANGE;
|
||||
|
||||
/* once we're in the cl_nodes tree networking can look us up by
|
||||
* node number and try to use our address and port attributes
|
||||
* to connect to this node.. make sure that they've been set
|
||||
* before writing the node attribute? */
|
||||
if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
|
||||
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
|
||||
return -EINVAL; /* XXX */
|
||||
|
||||
write_lock(&cluster->cl_nodes_lock);
|
||||
if (cluster->cl_nodes[tmp])
|
||||
p = NULL;
|
||||
else {
|
||||
cluster->cl_nodes[tmp] = node;
|
||||
node->nd_num = tmp;
|
||||
set_bit(tmp, cluster->cl_nodes_bitmap);
|
||||
}
|
||||
write_unlock(&cluster->cl_nodes_lock);
|
||||
if (p == NULL)
|
||||
return -EEXIST;
|
||||
|
||||
return count;
|
||||
}
|
||||
static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port));
|
||||
}
|
||||
|
||||
/*
 * Store handler for a node's "ipv4_port" attribute.
 *
 * Parses @page as an unsigned integer (base chosen by simple_strtoul),
 * optionally followed by a newline, and records it in
 * node->nd_ipv4_port in network byte order.
 *
 * Returns @count on success, -EINVAL for unparsable input or port 0,
 * and -ERANGE for values that do not fit in 16 bits.
 */
static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node,
					 const char *page, size_t count)
{
	unsigned long tmp;
	char *p = (char *)page;

	tmp = simple_strtoul(p, &p, 0);
	if (!p || (*p && (*p != '\n')))
		return -EINVAL;

	if (tmp == 0)
		return -EINVAL;
	/* 65535 is a legal port; only reject what overflows a u16 */
	if (tmp > (u16)-1)
		return -ERANGE;

	node->nd_ipv4_port = htons(tmp);

	return count;
}
|
||||
|
||||
static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address));
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node,
|
||||
const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
|
||||
int ret, i;
|
||||
struct rb_node **p, *parent;
|
||||
unsigned int octets[4];
|
||||
__be32 ipv4_addr = 0;
|
||||
|
||||
ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2],
|
||||
&octets[1], &octets[0]);
|
||||
if (ret != 4)
|
||||
return -EINVAL;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(octets); i++) {
|
||||
if (octets[i] > 255)
|
||||
return -ERANGE;
|
||||
be32_add_cpu(&ipv4_addr, octets[i] << (i * 8));
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
write_lock(&cluster->cl_nodes_lock);
|
||||
if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent))
|
||||
ret = -EEXIST;
|
||||
else {
|
||||
rb_link_node(&node->nd_ip_node, parent, p);
|
||||
rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree);
|
||||
}
|
||||
write_unlock(&cluster->cl_nodes_lock);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr));
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", node->nd_local);
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node);
|
||||
unsigned long tmp;
|
||||
char *p = (char *)page;
|
||||
ssize_t ret;
|
||||
|
||||
tmp = simple_strtoul(p, &p, 0);
|
||||
if (!p || (*p && (*p != '\n')))
|
||||
return -EINVAL;
|
||||
|
||||
tmp = !!tmp; /* boolean of whether this node wants to be local */
|
||||
|
||||
/* setting local turns on networking rx for now so we require having
|
||||
* set everything else first */
|
||||
if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) ||
|
||||
!test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) ||
|
||||
!test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes))
|
||||
return -EINVAL; /* XXX */
|
||||
|
||||
/* the only failure case is trying to set a new local node
|
||||
* when a different one is already set */
|
||||
if (tmp && tmp == cluster->cl_has_local &&
|
||||
cluster->cl_local_node != node->nd_num)
|
||||
return -EBUSY;
|
||||
|
||||
/* bring up the rx thread if we're setting the new local node. */
|
||||
if (tmp && !cluster->cl_has_local) {
|
||||
ret = o2net_start_listening(node);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (!tmp && cluster->cl_has_local &&
|
||||
cluster->cl_local_node == node->nd_num) {
|
||||
o2net_stop_listening(node);
|
||||
cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
|
||||
}
|
||||
|
||||
node->nd_local = tmp;
|
||||
if (node->nd_local) {
|
||||
cluster->cl_has_local = tmp;
|
||||
cluster->cl_local_node = node->nd_num;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
struct o2nm_node_attribute {
|
||||
struct configfs_attribute attr;
|
||||
ssize_t (*show)(struct o2nm_node *, char *);
|
||||
ssize_t (*store)(struct o2nm_node *, const char *, size_t);
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_num = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "num",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_num_read,
|
||||
.store = o2nm_node_num_write,
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "ipv4_port",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_ipv4_port_read,
|
||||
.store = o2nm_node_ipv4_port_write,
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "ipv4_address",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_ipv4_address_read,
|
||||
.store = o2nm_node_ipv4_address_write,
|
||||
};
|
||||
|
||||
static struct o2nm_node_attribute o2nm_node_attr_local = {
|
||||
.attr = { .ca_owner = THIS_MODULE,
|
||||
.ca_name = "local",
|
||||
.ca_mode = S_IRUGO | S_IWUSR },
|
||||
.show = o2nm_node_local_read,
|
||||
.store = o2nm_node_local_write,
|
||||
};
|
||||
|
||||
static struct configfs_attribute *o2nm_node_attrs[] = {
|
||||
[O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr,
|
||||
[O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr,
|
||||
[O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr,
|
||||
[O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int o2nm_attr_index(struct configfs_attribute *attr)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) {
|
||||
if (attr == o2nm_node_attrs[i])
|
||||
return i;
|
||||
}
|
||||
BUG();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t o2nm_node_show(struct config_item *item,
|
||||
struct configfs_attribute *attr,
|
||||
char *page)
|
||||
{
|
||||
struct o2nm_node *node = to_o2nm_node(item);
|
||||
struct o2nm_node_attribute *o2nm_node_attr =
|
||||
container_of(attr, struct o2nm_node_attribute, attr);
|
||||
ssize_t ret = 0;
|
||||
|
||||
if (o2nm_node_attr->show)
|
||||
ret = o2nm_node_attr->show(node, page);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * configfs store_attribute hook for node items.
 *
 * Each attribute may be written successfully only once: after a
 * complete store its bit is set in nd_set_attributes and later writes
 * fail with -EBUSY.
 *
 * Returns the attribute ->store result (normally @count), -EINVAL when
 * the attribute has no ->store method, or -EBUSY when it was already
 * set.
 */
static ssize_t o2nm_node_store(struct config_item *item,
			       struct configfs_attribute *attr,
			       const char *page, size_t count)
{
	struct o2nm_node *node = to_o2nm_node(item);
	struct o2nm_node_attribute *o2nm_node_attr =
		container_of(attr, struct o2nm_node_attribute, attr);
	ssize_t ret;
	int attr_index = o2nm_attr_index(attr);

	if (o2nm_node_attr->store == NULL)
		return -EINVAL;

	if (test_bit(attr_index, &node->nd_set_attributes))
		return -EBUSY;

	ret = o2nm_node_attr->store(node, page, count);
	/*
	 * ret is signed and count is unsigned: a bare "ret < count" would
	 * convert a negative errno to a huge unsigned value, fall through
	 * and mark the attribute as set even though the store failed.
	 * Test the sign explicitly first.
	 */
	if (ret < 0 || (size_t)ret < count)
		return ret;

	set_bit(attr_index, &node->nd_set_attributes);

	return ret;
}
|
||||
|
||||
static struct configfs_item_operations o2nm_node_item_ops = {
|
||||
.release = o2nm_node_release,
|
||||
.show_attribute = o2nm_node_show,
|
||||
.store_attribute = o2nm_node_store,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_node_type = {
|
||||
.ct_item_ops = &o2nm_node_item_ops,
|
||||
.ct_attrs = o2nm_node_attrs,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* node set */
|
||||
|
||||
struct o2nm_node_group {
|
||||
struct config_group ns_group;
|
||||
/* some stuff? */
|
||||
};
|
||||
|
||||
#if 0
|
||||
static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group)
|
||||
{
|
||||
return group ?
|
||||
container_of(group, struct o2nm_node_group, ns_group)
|
||||
: NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct config_item *o2nm_node_group_make_item(struct config_group *group,
|
||||
const char *name)
|
||||
{
|
||||
struct o2nm_node *node = NULL;
|
||||
struct config_item *ret = NULL;
|
||||
|
||||
if (strlen(name) > O2NM_MAX_NAME_LEN)
|
||||
goto out; /* ENAMETOOLONG */
|
||||
|
||||
node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL);
|
||||
if (node == NULL)
|
||||
goto out; /* ENOMEM */
|
||||
|
||||
strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
|
||||
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
|
||||
spin_lock_init(&node->nd_lock);
|
||||
|
||||
ret = &node->nd_item;
|
||||
|
||||
out:
|
||||
if (ret == NULL)
|
||||
kfree(node);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void o2nm_node_group_drop_item(struct config_group *group,
|
||||
struct config_item *item)
|
||||
{
|
||||
struct o2nm_node *node = to_o2nm_node(item);
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent);
|
||||
|
||||
o2net_disconnect_node(node);
|
||||
|
||||
if (cluster->cl_has_local &&
|
||||
(cluster->cl_local_node == node->nd_num)) {
|
||||
cluster->cl_has_local = 0;
|
||||
cluster->cl_local_node = O2NM_INVALID_NODE_NUM;
|
||||
o2net_stop_listening(node);
|
||||
}
|
||||
|
||||
/* XXX call into net to stop this node from trading messages */
|
||||
|
||||
write_lock(&cluster->cl_nodes_lock);
|
||||
|
||||
/* XXX sloppy */
|
||||
if (node->nd_ipv4_address)
|
||||
rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree);
|
||||
|
||||
/* nd_num might be 0 if the node number hasn't been set.. */
|
||||
if (cluster->cl_nodes[node->nd_num] == node) {
|
||||
cluster->cl_nodes[node->nd_num] = NULL;
|
||||
clear_bit(node->nd_num, cluster->cl_nodes_bitmap);
|
||||
}
|
||||
write_unlock(&cluster->cl_nodes_lock);
|
||||
|
||||
config_item_put(item);
|
||||
}
|
||||
|
||||
static struct configfs_group_operations o2nm_node_group_group_ops = {
|
||||
.make_item = o2nm_node_group_make_item,
|
||||
.drop_item = o2nm_node_group_drop_item,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_node_group_type = {
|
||||
.ct_group_ops = &o2nm_node_group_group_ops,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* cluster */
|
||||
|
||||
static void o2nm_cluster_release(struct config_item *item)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
|
||||
|
||||
kfree(cluster->cl_group.default_groups);
|
||||
kfree(cluster);
|
||||
}
|
||||
|
||||
static struct configfs_item_operations o2nm_cluster_item_ops = {
|
||||
.release = o2nm_cluster_release,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_cluster_type = {
|
||||
.ct_item_ops = &o2nm_cluster_item_ops,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* cluster set */
|
||||
|
||||
struct o2nm_cluster_group {
|
||||
struct configfs_subsystem cs_subsys;
|
||||
/* some stuff? */
|
||||
};
|
||||
|
||||
#if 0
|
||||
static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group)
|
||||
{
|
||||
return group ?
|
||||
container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys)
|
||||
: NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct config_group *o2nm_cluster_group_make_group(struct config_group *group,
|
||||
const char *name)
|
||||
{
|
||||
struct o2nm_cluster *cluster = NULL;
|
||||
struct o2nm_node_group *ns = NULL;
|
||||
struct config_group *o2hb_group = NULL, *ret = NULL;
|
||||
void *defs = NULL;
|
||||
|
||||
/* this runs under the parent dir's i_sem; there can be only
|
||||
* one caller in here at a time */
|
||||
if (o2nm_single_cluster)
|
||||
goto out; /* ENOSPC */
|
||||
|
||||
cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL);
|
||||
ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL);
|
||||
defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
|
||||
o2hb_group = o2hb_alloc_hb_set();
|
||||
if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL)
|
||||
goto out;
|
||||
|
||||
config_group_init_type_name(&cluster->cl_group, name,
|
||||
&o2nm_cluster_type);
|
||||
config_group_init_type_name(&ns->ns_group, "node",
|
||||
&o2nm_node_group_type);
|
||||
|
||||
cluster->cl_group.default_groups = defs;
|
||||
cluster->cl_group.default_groups[0] = &ns->ns_group;
|
||||
cluster->cl_group.default_groups[1] = o2hb_group;
|
||||
cluster->cl_group.default_groups[2] = NULL;
|
||||
rwlock_init(&cluster->cl_nodes_lock);
|
||||
cluster->cl_node_ip_tree = RB_ROOT;
|
||||
|
||||
ret = &cluster->cl_group;
|
||||
o2nm_single_cluster = cluster;
|
||||
|
||||
out:
|
||||
if (ret == NULL) {
|
||||
kfree(cluster);
|
||||
kfree(ns);
|
||||
o2hb_free_hb_set(o2hb_group);
|
||||
kfree(defs);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item)
|
||||
{
|
||||
struct o2nm_cluster *cluster = to_o2nm_cluster(item);
|
||||
int i;
|
||||
struct config_item *killme;
|
||||
|
||||
BUG_ON(o2nm_single_cluster != cluster);
|
||||
o2nm_single_cluster = NULL;
|
||||
|
||||
for (i = 0; cluster->cl_group.default_groups[i]; i++) {
|
||||
killme = &cluster->cl_group.default_groups[i]->cg_item;
|
||||
cluster->cl_group.default_groups[i] = NULL;
|
||||
config_item_put(killme);
|
||||
}
|
||||
|
||||
config_item_put(item);
|
||||
}
|
||||
|
||||
static struct configfs_group_operations o2nm_cluster_group_group_ops = {
|
||||
.make_group = o2nm_cluster_group_make_group,
|
||||
.drop_item = o2nm_cluster_group_drop_item,
|
||||
};
|
||||
|
||||
static struct config_item_type o2nm_cluster_group_type = {
|
||||
.ct_group_ops = &o2nm_cluster_group_group_ops,
|
||||
.ct_owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct o2nm_cluster_group o2nm_cluster_group = {
|
||||
.cs_subsys = {
|
||||
.su_group = {
|
||||
.cg_item = {
|
||||
.ci_namebuf = "cluster",
|
||||
.ci_type = &o2nm_cluster_group_type,
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
static void __exit exit_o2nm(void)
|
||||
{
|
||||
if (ocfs2_table_header)
|
||||
unregister_sysctl_table(ocfs2_table_header);
|
||||
|
||||
/* XXX sync with hb callbacks and shut down hb? */
|
||||
o2net_unregister_hb_callbacks();
|
||||
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
|
||||
o2cb_sys_shutdown();
|
||||
|
||||
o2net_exit();
|
||||
}
|
||||
|
||||
/*
 * Module init: bring up heartbeat and networking, then register the
 * sysctl table, the heartbeat callbacks, the configfs subsystem and the
 * o2cb sys interface, unwinding the registrations in reverse on failure.
 *
 * NOTE(review): the results of o2hb_init()/o2net_init() are not checked
 * and the failure paths do not call o2net_exit() -- confirm whether
 * that is intentional.
 */
static int __init init_o2nm(void)
{
	int ret;

	cluster_print_version();

	o2hb_init();
	o2net_init();

	ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0);
	if (ocfs2_table_header == NULL) {
		printk(KERN_ERR "nodemanager: unable to register sysctl\n");
		return -ENOMEM; /* or something. */
	}

	ret = o2net_register_hb_callbacks();
	if (ret != 0)
		goto out_sysctl;

	config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
	init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
	ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
	if (ret != 0) {
		printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
		goto out_callbacks;
	}

	ret = o2cb_sys_init();
	if (ret == 0)
		return 0;

	configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
out_callbacks:
	o2net_unregister_hb_callbacks();
out_sysctl:
	unregister_sysctl_table(ocfs2_table_header);
	return ret;
}
|
||||
|
||||
MODULE_AUTHOR("Oracle");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(init_o2nm)
|
||||
module_exit(exit_o2nm)
|
64
fs/ocfs2/cluster/nodemanager.h
Normal file
64
fs/ocfs2/cluster/nodemanager.h
Normal file
@ -0,0 +1,64 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* nodemanager.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_NODEMANAGER_H
|
||||
#define O2CLUSTER_NODEMANAGER_H
|
||||
|
||||
#include "ocfs2_nodemanager.h"
|
||||
|
||||
/* This totally doesn't belong here. */
|
||||
#include <linux/configfs.h>
|
||||
#include <linux/rbtree.h>
|
||||
|
||||
#define KERN_OCFS2 988
|
||||
#define KERN_OCFS2_NM 1
|
||||
|
||||
const char *o2nm_get_hb_ctl_path(void);
|
||||
|
||||
struct o2nm_node {
|
||||
spinlock_t nd_lock;
|
||||
struct config_item nd_item;
|
||||
char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */
|
||||
__u8 nd_num;
|
||||
/* only one address per node, as attributes, for now. */
|
||||
__be32 nd_ipv4_address;
|
||||
__be16 nd_ipv4_port;
|
||||
struct rb_node nd_ip_node;
|
||||
/* there can be only one local node for now */
|
||||
int nd_local;
|
||||
|
||||
unsigned long nd_set_attributes;
|
||||
};
|
||||
|
||||
u8 o2nm_this_node(void);
|
||||
|
||||
int o2nm_configured_node_map(unsigned long *map, unsigned bytes);
|
||||
struct o2nm_node *o2nm_get_node_by_num(u8 node_num);
|
||||
struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
|
||||
void o2nm_node_get(struct o2nm_node *node);
|
||||
void o2nm_node_put(struct o2nm_node *node);
|
||||
|
||||
#endif /* O2CLUSTER_NODEMANAGER_H */
|
37
fs/ocfs2/cluster/ocfs2_heartbeat.h
Normal file
37
fs/ocfs2/cluster/ocfs2_heartbeat.h
Normal file
@ -0,0 +1,37 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs2_heartbeat.h
|
||||
*
|
||||
* On-disk structures for ocfs2_heartbeat
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _OCFS2_HEARTBEAT_H
|
||||
#define _OCFS2_HEARTBEAT_H
|
||||
|
||||
struct o2hb_disk_heartbeat_block {
|
||||
__le64 hb_seq;
|
||||
__u8 hb_node;
|
||||
__u8 hb_pad1[3];
|
||||
__le32 hb_cksum;
|
||||
__le64 hb_generation;
|
||||
};
|
||||
|
||||
#endif /* _OCFS2_HEARTBEAT_H */
|
39
fs/ocfs2/cluster/ocfs2_nodemanager.h
Normal file
39
fs/ocfs2/cluster/ocfs2_nodemanager.h
Normal file
@ -0,0 +1,39 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs2_nodemanager.h
|
||||
*
|
||||
* Header describing the interface between userspace and the kernel
|
||||
* for the ocfs2_nodemanager module.
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef _OCFS2_NODEMANAGER_H
|
||||
#define _OCFS2_NODEMANAGER_H
|
||||
|
||||
#define O2NM_API_VERSION 5
|
||||
|
||||
#define O2NM_MAX_NODES 255
|
||||
#define O2NM_INVALID_NODE_NUM 255
|
||||
|
||||
/* host name, group name, cluster name all 64 bytes */
|
||||
#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN
|
||||
|
||||
#endif /* _OCFS2_NODEMANAGER_H */
|
315
fs/ocfs2/cluster/quorum.c
Normal file
315
fs/ocfs2/cluster/quorum.c
Normal file
@ -0,0 +1,315 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
*
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*/
|
||||
|
||||
/* This quorum hack is only here until we transition to some more rational
|
||||
* approach that is driven from userspace. Honest. No foolin'.
|
||||
*
|
||||
* Imagine two nodes lose network connectivity to each other but they're still
|
||||
* up and operating in every other way. Presumably a network timeout indicates
|
||||
* that a node is broken and should be recovered. They can't both recover each
|
||||
* other and both carry on without serialising their access to the file system.
|
||||
* They need to decide who is authoritative. Now extend that problem to
|
||||
* arbitrary groups of nodes losing connectivity between each other.
|
||||
*
|
||||
* So we declare that a node which has given up on connecting to a majority
|
||||
* of nodes who are still heartbeating will fence itself.
|
||||
*
|
||||
* There are huge opportunities for races here. After we give up on a node's
|
||||
* connection we need to wait long enough to give heartbeat an opportunity
|
||||
* to declare the node as truly dead. We also need to be careful with the
|
||||
* race between when we see a node start heartbeating and when we connect
|
||||
* to it.
|
||||
*
|
||||
* So nodes that are in this transition put a hold on the quorum decision
|
||||
* with a counter. As they fall out of this transition they drop the count
|
||||
* and if they're the last, they fire off the decision.
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
#include "heartbeat.h"
|
||||
#include "nodemanager.h"
|
||||
#define MLOG_MASK_PREFIX ML_QUORUM
|
||||
#include "masklog.h"
|
||||
#include "quorum.h"
|
||||
|
||||
static struct o2quo_state {
|
||||
spinlock_t qs_lock;
|
||||
struct work_struct qs_work;
|
||||
int qs_pending;
|
||||
int qs_heartbeating;
|
||||
unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
int qs_connected;
|
||||
unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
int qs_holds;
|
||||
unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
} o2quo_state;
|
||||
|
||||
/* this is horribly heavy-handed. It should instead flip the file
|
||||
* system RO and call some userspace script. */
|
||||
static void o2quo_fence_self(void)
|
||||
{
|
||||
/* panic spins with interrupts enabled. with preempt
|
||||
* threads can still schedule, etc, etc */
|
||||
o2hb_stop_all_regions();
|
||||
panic("ocfs2 is very sorry to be fencing this system by panicing\n");
|
||||
}
|
||||
|
||||
/* Indicate that a timeout occurred on a heartbeat region write. The
|
||||
* other nodes in the cluster may consider us dead at that time so we
|
||||
* want to "fence" ourselves so that we don't scribble on the disk
|
||||
* after they think they've recovered us. This can't solve all
|
||||
* problems related to writeout after recovery but this hack can at
|
||||
* least close some of those gaps. When we have real fencing, this can
|
||||
* go away as our node would be fenced externally before other nodes
|
||||
* begin recovery. */
|
||||
void o2quo_disk_timeout(void)
|
||||
{
|
||||
o2quo_fence_self();
|
||||
}
|
||||
|
||||
static void o2quo_make_decision(void *arg)
|
||||
{
|
||||
int quorum;
|
||||
int lowest_hb, lowest_reachable = 0, fence = 0;
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock(&qs->qs_lock);
|
||||
|
||||
lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
|
||||
if (lowest_hb != O2NM_MAX_NODES)
|
||||
lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);
|
||||
|
||||
mlog(0, "heartbeating: %d, connected: %d, "
|
||||
"lowest: %d (%sreachable)\n", qs->qs_heartbeating,
|
||||
qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");
|
||||
|
||||
if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
|
||||
qs->qs_heartbeating == 1)
|
||||
goto out;
|
||||
|
||||
if (qs->qs_heartbeating & 1) {
|
||||
/* the odd numbered cluster case is straight forward --
|
||||
* if we can't talk to the majority we're hosed */
|
||||
quorum = (qs->qs_heartbeating + 1)/2;
|
||||
if (qs->qs_connected < quorum) {
|
||||
mlog(ML_ERROR, "fencing this node because it is "
|
||||
"only connected to %u nodes and %u is needed "
|
||||
"to make a quorum out of %u heartbeating nodes\n",
|
||||
qs->qs_connected, quorum,
|
||||
qs->qs_heartbeating);
|
||||
fence = 1;
|
||||
}
|
||||
} else {
|
||||
/* the even numbered cluster adds the possibility of each half
|
||||
* of the cluster being able to talk amongst themselves.. in
|
||||
* that case we're hosed if we can't talk to the group that has
|
||||
* the lowest numbered node */
|
||||
quorum = qs->qs_heartbeating / 2;
|
||||
if (qs->qs_connected < quorum) {
|
||||
mlog(ML_ERROR, "fencing this node because it is "
|
||||
"only connected to %u nodes and %u is needed "
|
||||
"to make a quorum out of %u heartbeating nodes\n",
|
||||
qs->qs_connected, quorum,
|
||||
qs->qs_heartbeating);
|
||||
fence = 1;
|
||||
}
|
||||
else if ((qs->qs_connected == quorum) &&
|
||||
!lowest_reachable) {
|
||||
mlog(ML_ERROR, "fencing this node because it is "
|
||||
"connected to a half-quorum of %u out of %u "
|
||||
"nodes which doesn't include the lowest active "
|
||||
"node %u\n", quorum, qs->qs_heartbeating,
|
||||
lowest_hb);
|
||||
fence = 1;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
spin_unlock(&qs->qs_lock);
|
||||
if (fence)
|
||||
o2quo_fence_self();
|
||||
}
|
||||
|
||||
/*
 * Take a hold on the quorum decision on behalf of @node.  Holds are
 * tracked per node in qs_hold_bm, so taking one that is already held
 * is a no-op.  Caller must hold qs_lock.
 */
static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (test_and_set_bit(node, qs->qs_hold_bm))
		return;

	qs->qs_holds++;
	mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog(0, "node %u, %d total\n", node, qs->qs_holds);
}
|
||||
|
||||
/*
 * Drop @node's hold on the quorum decision, if it has one.  When the
 * last hold goes away and a decision is pending, the decision work is
 * scheduled.  Caller must hold qs_lock.
 */
static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (!test_and_clear_bit(node, qs->qs_hold_bm))
		return;

	mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);

	qs->qs_holds--;
	if (qs->qs_holds == 0 && qs->qs_pending) {
		qs->qs_pending = 0;
		schedule_work(&qs->qs_work);
	}

	mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
			node, qs->qs_holds);
}
|
||||
|
||||
/* as a node comes up we delay the quorum decision until we know the fate of
|
||||
* the connection. the hold will be dropped in conn_up or hb_down. it might be
|
||||
* perpetuated by con_err until hb_down. if we already have a conn, we might
|
||||
* be dropping a hold that conn_up got. */
|
||||
void o2quo_hb_up(u8 node)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock(&qs->qs_lock);
|
||||
|
||||
qs->qs_heartbeating++;
|
||||
mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
|
||||
"node %u\n", node);
|
||||
mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
|
||||
set_bit(node, qs->qs_hb_bm);
|
||||
|
||||
mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
|
||||
|
||||
if (!test_bit(node, qs->qs_conn_bm))
|
||||
o2quo_set_hold(qs, node);
|
||||
else
|
||||
o2quo_clear_hold(qs, node);
|
||||
|
||||
spin_unlock(&qs->qs_lock);
|
||||
}
|
||||
|
||||
/* hb going down releases any holds we might have had due to this node from
|
||||
* conn_up, conn_err, or hb_up */
|
||||
void o2quo_hb_down(u8 node)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock(&qs->qs_lock);
|
||||
|
||||
qs->qs_heartbeating--;
|
||||
mlog_bug_on_msg(qs->qs_heartbeating < 0,
|
||||
"node %u, %d heartbeating\n",
|
||||
node, qs->qs_heartbeating);
|
||||
mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
|
||||
clear_bit(node, qs->qs_hb_bm);
|
||||
|
||||
mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);
|
||||
|
||||
o2quo_clear_hold(qs, node);
|
||||
|
||||
spin_unlock(&qs->qs_lock);
|
||||
}
|
||||
|
||||
/* this tells us that we've decided that the node is still heartbeating
|
||||
* even though we've lost it's conn. it must only be called after conn_err
|
||||
* and indicates that we must now make a quorum decision in the future,
|
||||
* though we might be doing so after waiting for holds to drain. Here
|
||||
* we'll be dropping the hold from conn_err. */
|
||||
void o2quo_hb_still_up(u8 node)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock(&qs->qs_lock);
|
||||
|
||||
mlog(0, "node %u\n", node);
|
||||
|
||||
qs->qs_pending = 1;
|
||||
o2quo_clear_hold(qs, node);
|
||||
|
||||
spin_unlock(&qs->qs_lock);
|
||||
}
|
||||
|
||||
/* This is analagous to hb_up. as a node's connection comes up we delay the
|
||||
* quorum decision until we see it heartbeating. the hold will be droped in
|
||||
* hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
|
||||
* it's already heartbeating we we might be dropping a hold that conn_up got.
|
||||
* */
|
||||
void o2quo_conn_up(u8 node)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock(&qs->qs_lock);
|
||||
|
||||
qs->qs_connected++;
|
||||
mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
|
||||
"node %u\n", node);
|
||||
mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
|
||||
set_bit(node, qs->qs_conn_bm);
|
||||
|
||||
mlog(0, "node %u, %d total\n", node, qs->qs_connected);
|
||||
|
||||
if (!test_bit(node, qs->qs_hb_bm))
|
||||
o2quo_set_hold(qs, node);
|
||||
else
|
||||
o2quo_clear_hold(qs, node);
|
||||
|
||||
spin_unlock(&qs->qs_lock);
|
||||
}
|
||||
|
||||
/* we've decided that we won't ever be connecting to the node again. if it's
|
||||
* still heartbeating we grab a hold that will delay decisions until either the
|
||||
* node stops heartbeating from hb_down or the caller decides that the node is
|
||||
* still up and calls still_up */
|
||||
void o2quo_conn_err(u8 node)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock(&qs->qs_lock);
|
||||
|
||||
if (test_bit(node, qs->qs_conn_bm)) {
|
||||
qs->qs_connected--;
|
||||
mlog_bug_on_msg(qs->qs_connected < 0,
|
||||
"node %u, connected %d\n",
|
||||
node, qs->qs_connected);
|
||||
|
||||
clear_bit(node, qs->qs_conn_bm);
|
||||
}
|
||||
|
||||
mlog(0, "node %u, %d total\n", node, qs->qs_connected);
|
||||
|
||||
if (test_bit(node, qs->qs_hb_bm))
|
||||
o2quo_set_hold(qs, node);
|
||||
|
||||
spin_unlock(&qs->qs_lock);
|
||||
}
|
||||
|
||||
void o2quo_init(void)
|
||||
{
|
||||
struct o2quo_state *qs = &o2quo_state;
|
||||
|
||||
spin_lock_init(&qs->qs_lock);
|
||||
INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL);
|
||||
}
|
||||
|
||||
/*
 * qs_work is queued with schedule_work(), so flushing the scheduled work
 * waits for any decision that is still queued.
 */
void o2quo_exit(void)
{
	flush_scheduled_work();
}
|
36
fs/ocfs2/cluster/quorum.h
Normal file
36
fs/ocfs2/cluster/quorum.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_QUORUM_H
|
||||
#define O2CLUSTER_QUORUM_H
|
||||
|
||||
void o2quo_init(void);
|
||||
void o2quo_exit(void);
|
||||
|
||||
void o2quo_hb_up(u8 node);
|
||||
void o2quo_hb_down(u8 node);
|
||||
void o2quo_hb_still_up(u8 node);
|
||||
void o2quo_conn_up(u8 node);
|
||||
void o2quo_conn_err(u8 node);
|
||||
void o2quo_disk_timeout(void);
|
||||
|
||||
#endif /* O2CLUSTER_QUORUM_H */
|
124
fs/ocfs2/cluster/sys.c
Normal file
124
fs/ocfs2/cluster/sys.c
Normal file
@ -0,0 +1,124 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* sys.c
|
||||
*
|
||||
* OCFS2 cluster sysfs interface
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation,
|
||||
* version 2 of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/sysfs.h>
|
||||
|
||||
#include "ocfs2_nodemanager.h"
|
||||
#include "masklog.h"
|
||||
#include "sys.h"
|
||||
|
||||
struct o2cb_attribute {
|
||||
struct attribute attr;
|
||||
ssize_t (*show)(char *buf);
|
||||
ssize_t (*store)(const char *buf, size_t count);
|
||||
};
|
||||
|
||||
/* defines 'o2cb_attr_<name>' with the given mode and show/store hooks */
#define O2CB_ATTR(_name, _mode, _show, _store)	\
	struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store)

/* kobject -> enclosing subsystem, attribute -> enclosing o2cb_attribute */
#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset)
#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr)
|
||||
|
||||
static ssize_t o2cb_interface_revision_show(char *buf)
|
||||
{
|
||||
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
|
||||
}
|
||||
|
||||
static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL);
|
||||
|
||||
static struct attribute *o2cb_attrs[] = {
|
||||
&o2cb_attr_interface_revision.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static ssize_t
|
||||
o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer);
|
||||
static ssize_t
|
||||
o2cb_store(struct kobject * kobj, struct attribute * attr,
|
||||
const char * buffer, size_t count);
|
||||
static struct sysfs_ops o2cb_sysfs_ops = {
|
||||
.show = o2cb_show,
|
||||
.store = o2cb_store,
|
||||
};
|
||||
|
||||
static struct kobj_type o2cb_subsys_type = {
|
||||
.default_attrs = o2cb_attrs,
|
||||
.sysfs_ops = &o2cb_sysfs_ops,
|
||||
};
|
||||
|
||||
/* gives us o2cb_subsys */
|
||||
static decl_subsys(o2cb, NULL, NULL);
|
||||
|
||||
static ssize_t
|
||||
o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer)
|
||||
{
|
||||
struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
|
||||
struct subsystem *sbs = to_o2cb_subsys(kobj);
|
||||
|
||||
BUG_ON(sbs != &o2cb_subsys);
|
||||
|
||||
if (o2cb_attr->show)
|
||||
return o2cb_attr->show(buffer);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
o2cb_store(struct kobject * kobj, struct attribute * attr,
|
||||
const char * buffer, size_t count)
|
||||
{
|
||||
struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr);
|
||||
struct subsystem *sbs = to_o2cb_subsys(kobj);
|
||||
|
||||
BUG_ON(sbs != &o2cb_subsys);
|
||||
|
||||
if (o2cb_attr->store)
|
||||
return o2cb_attr->store(buffer, count);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
void o2cb_sys_shutdown(void)
|
||||
{
|
||||
mlog_sys_shutdown();
|
||||
subsystem_unregister(&o2cb_subsys);
|
||||
}
|
||||
|
||||
int o2cb_sys_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type;
|
||||
ret = subsystem_register(&o2cb_subsys);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = mlog_sys_init(&o2cb_subsys);
|
||||
if (ret)
|
||||
subsystem_unregister(&o2cb_subsys);
|
||||
return ret;
|
||||
}
|
33
fs/ocfs2/cluster/sys.h
Normal file
33
fs/ocfs2/cluster/sys.h
Normal file
@ -0,0 +1,33 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* sys.h
|
||||
*
|
||||
* Function prototypes for o2cb sysfs interface
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation,
|
||||
* version 2 of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_SYS_H
|
||||
#define O2CLUSTER_SYS_H
|
||||
|
||||
void o2cb_sys_shutdown(void);
|
||||
int o2cb_sys_init(void);
|
||||
|
||||
#endif /* O2CLUSTER_SYS_H */
|
1829
fs/ocfs2/cluster/tcp.c
Normal file
1829
fs/ocfs2/cluster/tcp.c
Normal file
File diff suppressed because it is too large
Load Diff
113
fs/ocfs2/cluster/tcp.h
Normal file
113
fs/ocfs2/cluster/tcp.h
Normal file
@ -0,0 +1,113 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* tcp.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_TCP_H
|
||||
#define O2CLUSTER_TCP_H
|
||||
|
||||
#include <linux/socket.h>
|
||||
#ifdef __KERNEL__
|
||||
#include <net/sock.h>
|
||||
#include <linux/tcp.h>
|
||||
#else
|
||||
#include <sys/socket.h>
|
||||
#endif
|
||||
#include <linux/inet.h>
|
||||
#include <linux/in.h>
|
||||
|
||||
struct o2net_msg
|
||||
{
|
||||
__be16 magic;
|
||||
__be16 data_len;
|
||||
__be16 msg_type;
|
||||
__be16 pad1;
|
||||
__be32 sys_status;
|
||||
__be32 status;
|
||||
__be32 key;
|
||||
__be32 msg_num;
|
||||
__u8 buf[0];
|
||||
};
|
||||
|
||||
/* signature of the handlers passed to o2net_register_handler() */
typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data);

/* what is left of 4096 bytes once the message header is accounted for */
#define O2NET_MAX_PAYLOAD_BYTES  (4096 - sizeof(struct o2net_msg))
|
||||
|
||||
/*
 * Decide whether 'err' and/or the state of 'sock' mean the link is down.
 * Returns 1 when a socket is given and it is neither ESTABLISHED nor in
 * CLOSE_WAIT, or when err is one of the errors listed below; 0 otherwise.
 *
 * TODO: figure this out....
 */
static inline int o2net_link_down(int err, struct socket *sock)
{
	if (sock &&
	    sock->sk->sk_state != TCP_ESTABLISHED &&
	    sock->sk->sk_state != TCP_CLOSE_WAIT)
		return 1;

	/* non-negative values are not errors at all */
	if (err >= 0)
		return 0;

	switch (err) {
	/* ????????????????????????? */
	case -ERESTARTSYS:
	case -EBADF:
	/* When the server has died, an ICMP port unreachable
	 * message prompts ECONNREFUSED. */
	case -ECONNREFUSED:
	case -ENOTCONN:
	case -ECONNRESET:
	case -EPIPE:
		return 1;
	default:
		return 0;
	}
}
|
||||
|
||||
/* NOTE(review): presumably the o2net driver's init states; no user of
 * these values is visible in this header -- confirm in tcp.c */
enum {
	O2NET_DRIVER_UNINITED,
	O2NET_DRIVER_READY,
};
|
||||
|
||||
int o2net_init_tcp_sock(struct inode *inode);
|
||||
int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
|
||||
u8 target_node, int *status);
|
||||
int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
|
||||
size_t veclen, u8 target_node, int *status);
|
||||
int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
|
||||
struct inode *group);
|
||||
|
||||
int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
|
||||
o2net_msg_handler_func *func, void *data,
|
||||
struct list_head *unreg_list);
|
||||
void o2net_unregister_handler_list(struct list_head *list);
|
||||
|
||||
struct o2nm_node;
|
||||
int o2net_register_hb_callbacks(void);
|
||||
void o2net_unregister_hb_callbacks(void);
|
||||
int o2net_start_listening(struct o2nm_node *node);
|
||||
void o2net_stop_listening(struct o2nm_node *node);
|
||||
void o2net_disconnect_node(struct o2nm_node *node);
|
||||
|
||||
int o2net_init(void);
|
||||
void o2net_exit(void);
|
||||
int o2net_proc_init(struct proc_dir_entry *parent);
|
||||
void o2net_proc_exit(struct proc_dir_entry *parent);
|
||||
|
||||
#endif /* O2CLUSTER_TCP_H */
|
174
fs/ocfs2/cluster/tcp_internal.h
Normal file
174
fs/ocfs2/cluster/tcp_internal.h
Normal file
@ -0,0 +1,174 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_TCP_INTERNAL_H
|
||||
#define O2CLUSTER_TCP_INTERNAL_H
|
||||
|
||||
/* 16 bit magic values, one per kind of message */
#define O2NET_MSG_MAGIC           ((u16)0xfa55)
#define O2NET_MSG_STATUS_MAGIC    ((u16)0xfa56)
#define O2NET_MSG_KEEP_REQ_MAGIC  ((u16)0xfa57)
#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58)

/* same as hb delay, we're waiting for another node to recognize our hb */
#define O2NET_RECONNECT_DELAY_MS	O2HB_REGION_TIMEOUT_MS

/* we're delaying our quorum decision so that heartbeat will have timed
 * out truly dead nodes by the time we come around to making decisions
 * on their number */
#define O2NET_QUORUM_DELAY_MS	((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS)

#define O2NET_KEEPALIVE_DELAY_SECS	5
#define O2NET_IDLE_TIMEOUT_SECS		10

/*
 * This version number represents quite a lot, unfortunately.  It not
 * only represents the raw network message protocol on the wire but also
 * locking semantics of the file system using the protocol.  It should
 * be somewhere else, I'm sure, but right now it isn't.
 *
 * New in version 2:
 * 	- full 64 bit i_size in the metadata lock lvbs
 * 	- introduction of "rw" lock and pushing meta/data locking down
 */
#define O2NET_PROTOCOL_VERSION 2ULL
|
||||
struct o2net_handshake {
|
||||
__be64 protocol_version;
|
||||
__be64 connector_id;
|
||||
};
|
||||
|
||||
struct o2net_node {
|
||||
/* this is never called from int/bh */
|
||||
spinlock_t nn_lock;
|
||||
|
||||
/* set the moment an sc is allocated and a connect is started */
|
||||
struct o2net_sock_container *nn_sc;
|
||||
/* _valid is only set after the handshake passes and tx can happen */
|
||||
unsigned nn_sc_valid:1;
|
||||
/* if this is set tx just returns it */
|
||||
int nn_persistent_error;
|
||||
|
||||
/* threads waiting for an sc to arrive wait on the wq for generation
|
||||
* to increase. it is increased when a connecting socket succeeds
|
||||
* or fails or when an accepted socket is attached. */
|
||||
wait_queue_head_t nn_sc_wq;
|
||||
|
||||
struct idr nn_status_idr;
|
||||
struct list_head nn_status_list;
|
||||
|
||||
/* connects are attempted from when heartbeat comes up until either hb
|
||||
* goes down, the node is unconfigured, no connect attempts succeed
|
||||
* before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
|
||||
* is queued from set_nn_state both from hb up and from itself if a
|
||||
* connect attempt fails and so can be self-arming. shutdown is
|
||||
* careful to first mark the nn such that no connects will be attempted
|
||||
* before canceling delayed connect work and flushing the queue. */
|
||||
struct work_struct nn_connect_work;
|
||||
unsigned long nn_last_connect_attempt;
|
||||
|
||||
/* this is queued as nodes come up and is canceled when a connection is
|
||||
* established. this expiring gives up on the node and errors out
|
||||
* transmits */
|
||||
struct work_struct nn_connect_expired;
|
||||
|
||||
/* after we give up on a socket we wait a while before deciding
|
||||
* that it is still heartbeating and that we should do some
|
||||
* quorum work */
|
||||
struct work_struct nn_still_up;
|
||||
};
|
||||
|
||||
struct o2net_sock_container {
|
||||
struct kref sc_kref;
|
||||
/* the next two are vaild for the life time of the sc */
|
||||
struct socket *sc_sock;
|
||||
struct o2nm_node *sc_node;
|
||||
|
||||
/* all of these sc work structs hold refs on the sc while they are
|
||||
* queued. they should not be able to ref a freed sc. the teardown
|
||||
* race is with o2net_wq destruction in o2net_stop_listening() */
|
||||
|
||||
/* rx and connect work are generated from socket callbacks. sc
|
||||
* shutdown removes the callbacks and then flushes the work queue */
|
||||
struct work_struct sc_rx_work;
|
||||
struct work_struct sc_connect_work;
|
||||
/* shutdown work is triggered in two ways. the simple way is
|
||||
* for a code path calls ensure_shutdown which gets a lock, removes
|
||||
* the sc from the nn, and queues the work. in this case the
|
||||
* work is single-shot. the work is also queued from a sock
|
||||
* callback, though, and in this case the work will find the sc
|
||||
* still on the nn and will call ensure_shutdown itself.. this
|
||||
* ends up triggering the shutdown work again, though nothing
|
||||
* will be done in that second iteration. so work queue teardown
|
||||
* has to be careful to remove the sc from the nn before waiting
|
||||
* on the work queue so that the shutdown work doesn't remove the
|
||||
* sc and rearm itself.
|
||||
*/
|
||||
struct work_struct sc_shutdown_work;
|
||||
|
||||
struct timer_list sc_idle_timeout;
|
||||
struct work_struct sc_keepalive_work;
|
||||
|
||||
unsigned sc_handshake_ok:1;
|
||||
|
||||
struct page *sc_page;
|
||||
size_t sc_page_off;
|
||||
|
||||
/* original handlers for the sockets */
|
||||
void (*sc_state_change)(struct sock *sk);
|
||||
void (*sc_data_ready)(struct sock *sk, int bytes);
|
||||
|
||||
struct timeval sc_tv_timer;
|
||||
struct timeval sc_tv_data_ready;
|
||||
struct timeval sc_tv_advance_start;
|
||||
struct timeval sc_tv_advance_stop;
|
||||
struct timeval sc_tv_func_start;
|
||||
struct timeval sc_tv_func_stop;
|
||||
u32 sc_msg_key;
|
||||
u16 sc_msg_type;
|
||||
};
|
||||
|
||||
struct o2net_msg_handler {
|
||||
struct rb_node nh_node;
|
||||
u32 nh_max_len;
|
||||
u32 nh_msg_type;
|
||||
u32 nh_key;
|
||||
o2net_msg_handler_func *nh_func;
|
||||
o2net_msg_handler_func *nh_func_data;
|
||||
struct kref nh_kref;
|
||||
struct list_head nh_unregister_item;
|
||||
};
|
||||
|
||||
/* system-level status of a message; O2NET_ERR_MAX only bounds the range */
enum o2net_system_error {
	O2NET_ERR_NONE = 0,
	O2NET_ERR_NO_HNDLR,
	O2NET_ERR_OVERFLOW,
	O2NET_ERR_DIED,
	O2NET_ERR_MAX
};
|
||||
|
||||
struct o2net_status_wait {
|
||||
enum o2net_system_error ns_sys_status;
|
||||
s32 ns_status;
|
||||
int ns_id;
|
||||
wait_queue_head_t ns_wq;
|
||||
struct list_head ns_node_item;
|
||||
};
|
||||
|
||||
#endif /* O2CLUSTER_TCP_INTERNAL_H */
|
42
fs/ocfs2/cluster/ver.c
Normal file
42
fs/ocfs2/cluster/ver.c
Normal file
@ -0,0 +1,42 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ver.c
|
||||
*
|
||||
* version string
|
||||
*
|
||||
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "ver.h"
|
||||
|
||||
#define CLUSTER_BUILD_VERSION "1.3.3"
|
||||
|
||||
#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
|
||||
|
||||
void cluster_print_version(void)
|
||||
{
|
||||
printk(KERN_INFO "%s\n", VERSION_STR);
|
||||
}
|
||||
|
||||
MODULE_DESCRIPTION(VERSION_STR);
|
||||
|
||||
MODULE_VERSION(CLUSTER_BUILD_VERSION);
|
31
fs/ocfs2/cluster/ver.h
Normal file
31
fs/ocfs2/cluster/ver.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ver.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef O2CLUSTER_VER_H
|
||||
#define O2CLUSTER_VER_H
|
||||
|
||||
void cluster_print_version(void);
|
||||
|
||||
#endif /* O2CLUSTER_VER_H */
|
91
fs/ocfs2/dcache.c
Normal file
91
fs/ocfs2/dcache.c
Normal file
@ -0,0 +1,91 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dcache.c
|
||||
*
|
||||
* dentry cache handling code
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DCACHE
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "dcache.h"
|
||||
#include "file.h"
|
||||
#include "inode.h"
|
||||
|
||||
static int ocfs2_dentry_revalidate(struct dentry *dentry,
|
||||
struct nameidata *nd)
|
||||
{
|
||||
struct inode *inode = dentry->d_inode;
|
||||
int ret = 0; /* if all else fails, just return false */
|
||||
struct ocfs2_super *osb;
|
||||
|
||||
mlog_entry("(0x%p, '%.*s')\n", dentry,
|
||||
dentry->d_name.len, dentry->d_name.name);
|
||||
|
||||
/* Never trust a negative dentry - force a new lookup. */
|
||||
if (inode == NULL) {
|
||||
mlog(0, "negative dentry: %.*s\n", dentry->d_name.len,
|
||||
dentry->d_name.name);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
osb = OCFS2_SB(inode->i_sb);
|
||||
|
||||
BUG_ON(!osb);
|
||||
|
||||
if (inode != osb->root_inode) {
|
||||
spin_lock(&OCFS2_I(inode)->ip_lock);
|
||||
/* did we or someone else delete this inode? */
|
||||
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
mlog(0, "inode (%"MLFu64") deleted, returning false\n",
|
||||
OCFS2_I(inode)->ip_blkno);
|
||||
goto bail;
|
||||
}
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
if (!inode->i_nlink) {
|
||||
mlog(0, "Inode %"MLFu64" orphaned, returning false "
|
||||
"dir = %d\n", OCFS2_I(inode)->ip_blkno,
|
||||
S_ISDIR(inode->i_mode));
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 1;
|
||||
|
||||
bail:
|
||||
mlog_exit(ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct dentry_operations ocfs2_dentry_ops = {
|
||||
.d_revalidate = ocfs2_dentry_revalidate,
|
||||
};
|
31
fs/ocfs2/dcache.h
Normal file
31
fs/ocfs2/dcache.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dcache.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_DCACHE_H
|
||||
#define OCFS2_DCACHE_H
|
||||
|
||||
extern struct dentry_operations ocfs2_dentry_ops;
|
||||
|
||||
#endif /* OCFS2_DCACHE_H */
|
618
fs/ocfs2/dir.c
Normal file
618
fs/ocfs2/dir.c
Normal file
@ -0,0 +1,618 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dir.c
|
||||
*
|
||||
* Creates, reads, walks and deletes directory-nodes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* Portions of this code from linux/fs/ext3/dir.c
|
||||
*
|
||||
* Copyright (C) 1992, 1993, 1994, 1995
|
||||
* Remy Card (card@masi.ibp.fr)
|
||||
* Laboratoire MASI - Institut Blaise Pascal
|
||||
* Universite Pierre et Marie Curie (Paris VI)
|
||||
*
|
||||
* from
|
||||
*
|
||||
* linux/fs/minix/dir.c
|
||||
*
|
||||
* Copyright (C) 1991, 1992 Linus Torvalds
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_NAMEI
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "dir.h"
|
||||
#include "dlmglue.h"
|
||||
#include "extent_map.h"
|
||||
#include "file.h"
|
||||
#include "inode.h"
|
||||
#include "journal.h"
|
||||
#include "namei.h"
|
||||
#include "suballoc.h"
|
||||
#include "uptodate.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
static unsigned char ocfs2_filetype_table[] = {
|
||||
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
|
||||
};
|
||||
|
||||
static int ocfs2_extend_dir(struct ocfs2_super *osb,
|
||||
struct inode *dir,
|
||||
struct buffer_head *parent_fe_bh,
|
||||
struct buffer_head **new_de_bh);
|
||||
/*
|
||||
* ocfs2_readdir()
|
||||
*
|
||||
*/
|
||||
int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
|
||||
{
|
||||
int error = 0;
|
||||
unsigned long offset, blk;
|
||||
int i, num, stored;
|
||||
struct buffer_head * bh, * tmp;
|
||||
struct ocfs2_dir_entry * de;
|
||||
int err;
|
||||
struct inode *inode = filp->f_dentry->d_inode;
|
||||
struct super_block * sb = inode->i_sb;
|
||||
int have_disk_lock = 0;
|
||||
|
||||
mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno);
|
||||
|
||||
stored = 0;
|
||||
bh = NULL;
|
||||
|
||||
error = ocfs2_meta_lock(inode, NULL, NULL, 0);
|
||||
if (error < 0) {
|
||||
if (error != -ENOENT)
|
||||
mlog_errno(error);
|
||||
/* we haven't got any yet, so propagate the error. */
|
||||
stored = error;
|
||||
goto bail;
|
||||
}
|
||||
have_disk_lock = 1;
|
||||
|
||||
offset = filp->f_pos & (sb->s_blocksize - 1);
|
||||
|
||||
while (!error && !stored && filp->f_pos < i_size_read(inode)) {
|
||||
blk = (filp->f_pos) >> sb->s_blocksize_bits;
|
||||
bh = ocfs2_bread(inode, blk, &err, 0);
|
||||
if (!bh) {
|
||||
mlog(ML_ERROR, "directory #%"MLFu64" contains a hole "
|
||||
"at offset %lld\n",
|
||||
OCFS2_I(inode)->ip_blkno,
|
||||
filp->f_pos);
|
||||
filp->f_pos += sb->s_blocksize - offset;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do the readahead (8k)
|
||||
*/
|
||||
if (!offset) {
|
||||
for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
|
||||
i > 0; i--) {
|
||||
tmp = ocfs2_bread(inode, ++blk, &err, 1);
|
||||
if (tmp)
|
||||
brelse(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
revalidate:
|
||||
/* If the dir block has changed since the last call to
|
||||
* readdir(2), then we might be pointing to an invalid
|
||||
* dirent right now. Scan from the start of the block
|
||||
* to make sure. */
|
||||
if (filp->f_version != inode->i_version) {
|
||||
for (i = 0; i < sb->s_blocksize && i < offset; ) {
|
||||
de = (struct ocfs2_dir_entry *) (bh->b_data + i);
|
||||
/* It's too expensive to do a full
|
||||
* dirent test each time round this
|
||||
* loop, but we do have to test at
|
||||
* least that it is non-zero. A
|
||||
* failure will be detected in the
|
||||
* dirent test below. */
|
||||
if (le16_to_cpu(de->rec_len) <
|
||||
OCFS2_DIR_REC_LEN(1))
|
||||
break;
|
||||
i += le16_to_cpu(de->rec_len);
|
||||
}
|
||||
offset = i;
|
||||
filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
|
||||
| offset;
|
||||
filp->f_version = inode->i_version;
|
||||
}
|
||||
|
||||
while (!error && filp->f_pos < i_size_read(inode)
|
||||
&& offset < sb->s_blocksize) {
|
||||
de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
|
||||
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
|
||||
/* On error, skip the f_pos to the
|
||||
next block. */
|
||||
filp->f_pos = (filp->f_pos |
|
||||
(sb->s_blocksize - 1)) + 1;
|
||||
brelse(bh);
|
||||
goto bail;
|
||||
}
|
||||
offset += le16_to_cpu(de->rec_len);
|
||||
if (le64_to_cpu(de->inode)) {
|
||||
/* We might block in the next section
|
||||
* if the data destination is
|
||||
* currently swapped out. So, use a
|
||||
* version stamp to detect whether or
|
||||
* not the directory has been modified
|
||||
* during the copy operation.
|
||||
*/
|
||||
unsigned long version = filp->f_version;
|
||||
unsigned char d_type = DT_UNKNOWN;
|
||||
|
||||
if (de->file_type < OCFS2_FT_MAX)
|
||||
d_type = ocfs2_filetype_table[de->file_type];
|
||||
error = filldir(dirent, de->name,
|
||||
de->name_len,
|
||||
filp->f_pos,
|
||||
ino_from_blkno(sb, le64_to_cpu(de->inode)),
|
||||
d_type);
|
||||
if (error)
|
||||
break;
|
||||
if (version != filp->f_version)
|
||||
goto revalidate;
|
||||
stored ++;
|
||||
}
|
||||
filp->f_pos += le16_to_cpu(de->rec_len);
|
||||
}
|
||||
offset = 0;
|
||||
brelse(bh);
|
||||
}
|
||||
|
||||
stored = 0;
|
||||
bail:
|
||||
if (have_disk_lock)
|
||||
ocfs2_meta_unlock(inode, 0);
|
||||
|
||||
mlog_exit(stored);
|
||||
|
||||
return stored;
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: this should always be called with parent dir i_sem taken.
|
||||
*/
|
||||
int ocfs2_find_files_on_disk(const char *name,
|
||||
int namelen,
|
||||
u64 *blkno,
|
||||
struct inode *inode,
|
||||
struct buffer_head **dirent_bh,
|
||||
struct ocfs2_dir_entry **dirent)
|
||||
{
|
||||
int status = -ENOENT;
|
||||
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
||||
|
||||
mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, "
|
||||
"inode=%p)\n",
|
||||
osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode);
|
||||
|
||||
*dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent);
|
||||
if (!*dirent_bh || !*dirent) {
|
||||
status = -ENOENT;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
*blkno = le64_to_cpu((*dirent)->inode);
|
||||
|
||||
status = 0;
|
||||
leave:
|
||||
if (status < 0) {
|
||||
*dirent = NULL;
|
||||
if (*dirent_bh) {
|
||||
brelse(*dirent_bh);
|
||||
*dirent_bh = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/* Check for a name within a directory.
|
||||
*
|
||||
* Return 0 if the name does not exist
|
||||
* Return -EEXIST if the directory contains the name
|
||||
*
|
||||
* Callers should have i_sem + a cluster lock on dir
|
||||
*/
|
||||
int ocfs2_check_dir_for_entry(struct inode *dir,
|
||||
const char *name,
|
||||
int namelen)
|
||||
{
|
||||
int ret;
|
||||
struct buffer_head *dirent_bh = NULL;
|
||||
struct ocfs2_dir_entry *dirent = NULL;
|
||||
|
||||
mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno,
|
||||
namelen, name);
|
||||
|
||||
ret = -EEXIST;
|
||||
dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent);
|
||||
if (dirent_bh)
|
||||
goto bail;
|
||||
|
||||
ret = 0;
|
||||
bail:
|
||||
if (dirent_bh)
|
||||
brelse(dirent_bh);
|
||||
|
||||
mlog_exit(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* routine to check that the specified directory is empty (for rmdir)
|
||||
*/
|
||||
int ocfs2_empty_dir(struct inode *inode)
|
||||
{
|
||||
unsigned long offset;
|
||||
struct buffer_head * bh;
|
||||
struct ocfs2_dir_entry * de, * de1;
|
||||
struct super_block * sb;
|
||||
int err;
|
||||
|
||||
sb = inode->i_sb;
|
||||
if ((i_size_read(inode) <
|
||||
(OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
|
||||
!(bh = ocfs2_bread(inode, 0, &err, 0))) {
|
||||
mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
|
||||
"no data block\n",
|
||||
OCFS2_I(inode)->ip_blkno);
|
||||
return 1;
|
||||
}
|
||||
|
||||
de = (struct ocfs2_dir_entry *) bh->b_data;
|
||||
de1 = (struct ocfs2_dir_entry *)
|
||||
((char *)de + le16_to_cpu(de->rec_len));
|
||||
if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
|
||||
!le64_to_cpu(de1->inode) ||
|
||||
strcmp(".", de->name) ||
|
||||
strcmp("..", de1->name)) {
|
||||
mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - "
|
||||
"no `.' or `..'\n",
|
||||
OCFS2_I(inode)->ip_blkno);
|
||||
brelse(bh);
|
||||
return 1;
|
||||
}
|
||||
offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
|
||||
de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
|
||||
while (offset < i_size_read(inode) ) {
|
||||
if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
|
||||
brelse(bh);
|
||||
bh = ocfs2_bread(inode,
|
||||
offset >> sb->s_blocksize_bits, &err, 0);
|
||||
if (!bh) {
|
||||
mlog(ML_ERROR, "directory #%"MLFu64" contains "
|
||||
"a hole at offset %lu\n",
|
||||
OCFS2_I(inode)->ip_blkno, offset);
|
||||
offset += sb->s_blocksize;
|
||||
continue;
|
||||
}
|
||||
de = (struct ocfs2_dir_entry *) bh->b_data;
|
||||
}
|
||||
if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
|
||||
brelse(bh);
|
||||
return 1;
|
||||
}
|
||||
if (le64_to_cpu(de->inode)) {
|
||||
brelse(bh);
|
||||
return 0;
|
||||
}
|
||||
offset += le16_to_cpu(de->rec_len);
|
||||
de = (struct ocfs2_dir_entry *)
|
||||
((char *)de + le16_to_cpu(de->rec_len));
|
||||
}
|
||||
brelse(bh);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* returns a bh of the 1st new block in the allocation. */
|
||||
int ocfs2_do_extend_dir(struct super_block *sb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct inode *dir,
|
||||
struct buffer_head *parent_fe_bh,
|
||||
struct ocfs2_alloc_context *data_ac,
|
||||
struct ocfs2_alloc_context *meta_ac,
|
||||
struct buffer_head **new_bh)
|
||||
{
|
||||
int status;
|
||||
int extend;
|
||||
u64 p_blkno;
|
||||
|
||||
spin_lock(&OCFS2_I(dir)->ip_lock);
|
||||
extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
|
||||
spin_unlock(&OCFS2_I(dir)->ip_lock);
|
||||
|
||||
if (extend) {
|
||||
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
|
||||
parent_fe_bh, handle,
|
||||
data_ac, meta_ac, NULL);
|
||||
BUG_ON(status == -EAGAIN);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
|
||||
(sb->s_blocksize_bits - 9)),
|
||||
1, &p_blkno, NULL);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
*new_bh = sb_getblk(sb, p_blkno);
|
||||
if (!*new_bh) {
|
||||
status = -EIO;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
status = 0;
|
||||
bail:
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/* assumes you already have a cluster lock on the directory. */
|
||||
static int ocfs2_extend_dir(struct ocfs2_super *osb,
|
||||
struct inode *dir,
|
||||
struct buffer_head *parent_fe_bh,
|
||||
struct buffer_head **new_de_bh)
|
||||
{
|
||||
int status = 0;
|
||||
int credits, num_free_extents;
|
||||
loff_t dir_i_size;
|
||||
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
|
||||
struct ocfs2_alloc_context *data_ac = NULL;
|
||||
struct ocfs2_alloc_context *meta_ac = NULL;
|
||||
struct ocfs2_journal_handle *handle = NULL;
|
||||
struct buffer_head *new_bh = NULL;
|
||||
struct ocfs2_dir_entry * de;
|
||||
struct super_block *sb = osb->sb;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
dir_i_size = i_size_read(dir);
|
||||
mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n",
|
||||
OCFS2_I(dir)->ip_blkno, dir_i_size);
|
||||
|
||||
handle = ocfs2_alloc_handle(osb);
|
||||
if (handle == NULL) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
/* dir->i_size is always block aligned. */
|
||||
spin_lock(&OCFS2_I(dir)->ip_lock);
|
||||
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
|
||||
spin_unlock(&OCFS2_I(dir)->ip_lock);
|
||||
num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
|
||||
if (num_free_extents < 0) {
|
||||
status = num_free_extents;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if (!num_free_extents) {
|
||||
status = ocfs2_reserve_new_metadata(osb, handle,
|
||||
fe, &meta_ac);
|
||||
if (status < 0) {
|
||||
if (status != -ENOSPC)
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac);
|
||||
if (status < 0) {
|
||||
if (status != -ENOSPC)
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
credits = ocfs2_calc_extend_credits(sb, fe, 1);
|
||||
} else {
|
||||
spin_unlock(&OCFS2_I(dir)->ip_lock);
|
||||
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
|
||||
}
|
||||
|
||||
handle = ocfs2_start_trans(osb, handle, credits);
|
||||
if (IS_ERR(handle)) {
|
||||
status = PTR_ERR(handle);
|
||||
handle = NULL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh,
|
||||
data_ac, meta_ac, &new_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
ocfs2_set_new_buffer_uptodate(dir, new_bh);
|
||||
|
||||
status = ocfs2_journal_access(handle, dir, new_bh,
|
||||
OCFS2_JOURNAL_ACCESS_CREATE);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
memset(new_bh->b_data, 0, sb->s_blocksize);
|
||||
de = (struct ocfs2_dir_entry *) new_bh->b_data;
|
||||
de->inode = 0;
|
||||
de->rec_len = cpu_to_le16(sb->s_blocksize);
|
||||
status = ocfs2_journal_dirty(handle, new_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
dir_i_size += dir->i_sb->s_blocksize;
|
||||
i_size_write(dir, dir_i_size);
|
||||
dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
|
||||
status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
*new_de_bh = new_bh;
|
||||
get_bh(*new_de_bh);
|
||||
bail:
|
||||
if (handle)
|
||||
ocfs2_commit_trans(handle);
|
||||
|
||||
if (data_ac)
|
||||
ocfs2_free_alloc_context(data_ac);
|
||||
if (meta_ac)
|
||||
ocfs2_free_alloc_context(meta_ac);
|
||||
|
||||
if (new_bh)
|
||||
brelse(new_bh);
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/*
|
||||
* Search the dir for a good spot, extending it if necessary. The
|
||||
* block containing an appropriate record is returned in ret_de_bh.
|
||||
*/
|
||||
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
|
||||
struct inode *dir,
|
||||
struct buffer_head *parent_fe_bh,
|
||||
const char *name,
|
||||
int namelen,
|
||||
struct buffer_head **ret_de_bh)
|
||||
{
|
||||
unsigned long offset;
|
||||
struct buffer_head * bh = NULL;
|
||||
unsigned short rec_len;
|
||||
struct ocfs2_dinode *fe;
|
||||
struct ocfs2_dir_entry *de;
|
||||
struct super_block *sb;
|
||||
int status;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n",
|
||||
namelen, OCFS2_I(dir)->ip_blkno);
|
||||
|
||||
BUG_ON(!S_ISDIR(dir->i_mode));
|
||||
fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
|
||||
BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
|
||||
|
||||
sb = dir->i_sb;
|
||||
|
||||
if (!namelen) {
|
||||
status = -EINVAL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
bh = ocfs2_bread(dir, 0, &status, 0);
|
||||
if (!bh) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
rec_len = OCFS2_DIR_REC_LEN(namelen);
|
||||
offset = 0;
|
||||
de = (struct ocfs2_dir_entry *) bh->b_data;
|
||||
while (1) {
|
||||
if ((char *)de >= sb->s_blocksize + bh->b_data) {
|
||||
brelse(bh);
|
||||
bh = NULL;
|
||||
|
||||
if (i_size_read(dir) <= offset) {
|
||||
status = ocfs2_extend_dir(osb,
|
||||
dir,
|
||||
parent_fe_bh,
|
||||
&bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
BUG_ON(!bh);
|
||||
*ret_de_bh = bh;
|
||||
get_bh(*ret_de_bh);
|
||||
goto bail;
|
||||
}
|
||||
bh = ocfs2_bread(dir,
|
||||
offset >> sb->s_blocksize_bits,
|
||||
&status,
|
||||
0);
|
||||
if (!bh) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
/* move to next block */
|
||||
de = (struct ocfs2_dir_entry *) bh->b_data;
|
||||
}
|
||||
if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
|
||||
status = -ENOENT;
|
||||
goto bail;
|
||||
}
|
||||
if (ocfs2_match(namelen, name, de)) {
|
||||
status = -EEXIST;
|
||||
goto bail;
|
||||
}
|
||||
if (((le64_to_cpu(de->inode) == 0) &&
|
||||
(le16_to_cpu(de->rec_len) >= rec_len)) ||
|
||||
(le16_to_cpu(de->rec_len) >=
|
||||
(OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
|
||||
/* Ok, we found a spot. Return this bh and let
|
||||
* the caller actually fill it in. */
|
||||
*ret_de_bh = bh;
|
||||
get_bh(*ret_de_bh);
|
||||
status = 0;
|
||||
goto bail;
|
||||
}
|
||||
offset += le16_to_cpu(de->rec_len);
|
||||
de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
|
||||
}
|
||||
|
||||
status = 0;
|
||||
bail:
|
||||
if (bh)
|
||||
brelse(bh);
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
54
fs/ocfs2/dir.h
Normal file
54
fs/ocfs2/dir.h
Normal file
@ -0,0 +1,54 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dir.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_DIR_H
|
||||
#define OCFS2_DIR_H
|
||||
|
||||
int ocfs2_check_dir_for_entry(struct inode *dir,
|
||||
const char *name,
|
||||
int namelen);
|
||||
int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */
|
||||
int ocfs2_find_files_on_disk(const char *name,
|
||||
int namelen,
|
||||
u64 *blkno,
|
||||
struct inode *inode,
|
||||
struct buffer_head **dirent_bh,
|
||||
struct ocfs2_dir_entry **dirent);
|
||||
int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
|
||||
int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
|
||||
struct inode *dir,
|
||||
struct buffer_head *parent_fe_bh,
|
||||
const char *name,
|
||||
int namelen,
|
||||
struct buffer_head **ret_de_bh);
|
||||
struct ocfs2_alloc_context;
|
||||
int ocfs2_do_extend_dir(struct super_block *sb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct inode *dir,
|
||||
struct buffer_head *parent_fe_bh,
|
||||
struct ocfs2_alloc_context *data_ac,
|
||||
struct ocfs2_alloc_context *meta_ac,
|
||||
struct buffer_head **new_bh);
|
||||
#endif /* OCFS2_DIR_H */
|
8
fs/ocfs2/dlm/Makefile
Normal file
8
fs/ocfs2/dlm/Makefile
Normal file
@ -0,0 +1,8 @@
|
||||
EXTRA_CFLAGS += -Ifs/ocfs2
|
||||
|
||||
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o
|
||||
|
||||
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
|
||||
dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
|
||||
|
||||
ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
|
214
fs/ocfs2/dlm/dlmapi.h
Normal file
214
fs/ocfs2/dlm/dlmapi.h
Normal file
@ -0,0 +1,214 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmapi.h
|
||||
*
|
||||
* externally exported dlm interfaces
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef DLMAPI_H
|
||||
#define DLMAPI_H
|
||||
|
||||
struct dlm_lock;
|
||||
struct dlm_ctxt;
|
||||
|
||||
/* Status codes returned by the DLM API.
 *
 * NOTE: changes made to this enum should be reflected in dlmdebug.c.
 * Values are spelled out explicitly so that each code's number is visible
 * at a glance. */
enum dlm_status {
	DLM_NORMAL               = 0,	/* request in progress */
	DLM_GRANTED              = 1,	/* request granted */
	DLM_DENIED               = 2,	/* request denied */
	DLM_DENIED_NOLOCKS       = 3,	/* request denied, out of system resources */
	DLM_WORKING              = 4,	/* async request in progress */
	DLM_BLOCKED              = 5,	/* lock request blocked */
	DLM_BLOCKED_ORPHAN       = 6,	/* lock request blocked by an orphan lock */
	DLM_DENIED_GRACE_PERIOD  = 7,	/* topological change in progress */
	DLM_SYSERR               = 8,	/* system error */
	DLM_NOSUPPORT            = 9,	/* unsupported */
	DLM_CANCELGRANT          = 10,	/* can't cancel convert: already granted */
	DLM_IVLOCKID             = 11,	/* bad lockid */
	DLM_SYNC                 = 12,	/* synchronous request granted */
	DLM_BADTYPE              = 13,	/* bad resource type */
	DLM_BADRESOURCE          = 14,	/* bad resource handle */
	DLM_MAXHANDLES           = 15,	/* no more resource handles */
	DLM_NOCLINFO             = 16,	/* can't contact cluster manager */
	DLM_NOLOCKMGR            = 17,	/* can't contact lock manager */
	DLM_NOPURGED             = 18,	/* can't contact purge daemon */
	DLM_BADARGS              = 19,	/* bad api args */
	DLM_VOID                 = 20,	/* no status */
	DLM_NOTQUEUED            = 21,	/* NOQUEUE was specified and request failed */
	DLM_IVBUFLEN             = 22,	/* invalid resource name length */
	DLM_CVTUNGRANT           = 23,	/* attempted to convert ungranted lock */
	DLM_BADPARAM             = 24,	/* invalid lock mode specified */
	DLM_VALNOTVALID          = 25,	/* value block has been invalidated */
	DLM_REJECTED             = 26,	/* request rejected, unrecognized client */
	DLM_ABORT                = 27,	/* blocked lock request cancelled */
	DLM_CANCEL               = 28,	/* conversion request cancelled */
	DLM_IVRESHANDLE          = 29,	/* invalid resource handle */
	DLM_DEADLOCK             = 30,	/* deadlock recovery refused this request */
	DLM_DENIED_NOASTS        = 31,	/* failed to allocate AST */
	DLM_FORWARD              = 32,	/* request must wait for primary's response */
	DLM_TIMEOUT              = 33,	/* timeout value for lock has expired */
	DLM_IVGROUPID            = 34,	/* invalid group specification */
	DLM_VERS_CONFLICT        = 35,	/* version conflicts prevent request handling */
	DLM_BAD_DEVICE_PATH      = 36,	/* Locks device does not exist or path wrong */
	DLM_NO_DEVICE_PERMISSION = 37,	/* Client has insufficient perms for device */
	DLM_NO_CONTROL_DEVICE    = 38,	/* Cannot set options on opened device */

	DLM_RECOVERING           = 39,	/* extension, allows caller to fail a lock
					   request if it is being recovered */
	DLM_MIGRATING            = 40,	/* extension, allows caller to fail a lock
					   request if it is being migrated */
	DLM_MAXSTATS             = 41,	/* upper limit for return code validation */
};
|
||||
|
||||
/* for pretty-printing dlm_status error messages */
|
||||
const char *dlm_errmsg(enum dlm_status err);
|
||||
/* for pretty-printing dlm_status error names */
|
||||
const char *dlm_errname(enum dlm_status err);
|
||||
|
||||
/* Eventually the DLM will use standard errno values, but in the
 * meantime this lets us track dlm errors as they bubble up.  When we
 * bring its error reporting into line with the rest of the stack,
 * these can just be replaced with calls to mlog_errno.
 *
 * Logs the name of status @st at ML_ERROR level unless it is one of
 * DLM_RECOVERING, DLM_MIGRATING or DLM_FORWARD.  The argument is
 * captured in a local first so that an expression with side effects
 * is evaluated exactly once. */
#define dlm_error(st) do {						\
	enum dlm_status _dlm_st = (st);					\
	if (_dlm_st != DLM_RECOVERING &&				\
	    _dlm_st != DLM_MIGRATING &&					\
	    _dlm_st != DLM_FORWARD)					\
		mlog(ML_ERROR, "dlm status = %s\n",			\
		     dlm_errname(_dlm_st));				\
} while (0)
|
||||
|
||||
#define DLM_LKSB_UNUSED1 0x01
|
||||
#define DLM_LKSB_PUT_LVB 0x02
|
||||
#define DLM_LKSB_GET_LVB 0x04
|
||||
#define DLM_LKSB_UNUSED2 0x08
|
||||
#define DLM_LKSB_UNUSED3 0x10
|
||||
#define DLM_LKSB_UNUSED4 0x20
|
||||
#define DLM_LKSB_UNUSED5 0x40
|
||||
#define DLM_LKSB_UNUSED6 0x80
|
||||
|
||||
#define DLM_LVB_LEN 64
|
||||
|
||||
/* Callers are only allowed access to the lvb and status members of
|
||||
* this struct. */
|
||||
struct dlm_lockstatus {
|
||||
enum dlm_status status;
|
||||
u32 flags;
|
||||
struct dlm_lock *lockid;
|
||||
char lvb[DLM_LVB_LEN];
|
||||
};
|
||||
|
||||
/* Valid lock modes. */
|
||||
#define LKM_IVMODE (-1) /* invalid mode */
|
||||
#define LKM_NLMODE 0 /* null lock */
|
||||
#define LKM_CRMODE 1 /* concurrent read unsupported */
|
||||
#define LKM_CWMODE 2 /* concurrent write unsupported */
|
||||
#define LKM_PRMODE 3 /* protected read */
|
||||
#define LKM_PWMODE 4 /* protected write unsupported */
|
||||
#define LKM_EXMODE 5 /* exclusive */
|
||||
#define LKM_MAXMODE 5
|
||||
#define LKM_MODEMASK 0xff
|
||||
|
||||
/* Flags passed to dlmlock and dlmunlock:
|
||||
* reserved: flags used by the "real" dlm
|
||||
* only a few are supported by this dlm
|
||||
* (U) = unsupported by ocfs2 dlm */
|
||||
#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */
|
||||
#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */
|
||||
#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */
|
||||
#define LKM_LOCAL 0x00000080 /* local lock request */
|
||||
#define LKM_VALBLK 0x00000100 /* lock value block request */
|
||||
#define LKM_NOQUEUE 0x00000200 /* non blocking request */
|
||||
#define LKM_CONVERT 0x00000400 /* conversion request */
|
||||
#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */
|
||||
#define LKM_UNLOCK 0x00001000 /* deallocate this lock */
|
||||
#define LKM_CANCEL 0x00002000 /* cancel conversion request */
|
||||
#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */
|
||||
#define LKM_INVVALBLK 0x00008000 /* invalidate lock value block */
|
||||
#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */
|
||||
#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */
|
||||
#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */
|
||||
#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */
|
||||
#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */
|
||||
#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */
|
||||
#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */
|
||||
#define LKM_FORCE 0x00800000 /* force unlock flag */
|
||||
#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate
|
||||
lock value block (U) */
|
||||
/* unused */
|
||||
#define LKM_UNUSED1 0x00000001 /* unused */
|
||||
#define LKM_UNUSED2 0x00000002 /* unused */
|
||||
#define LKM_UNUSED3 0x00000004 /* unused */
|
||||
#define LKM_UNUSED4 0x00000008 /* unused */
|
||||
#define LKM_UNUSED5 0x02000000 /* unused */
|
||||
#define LKM_UNUSED6 0x04000000 /* unused */
|
||||
#define LKM_UNUSED7 0x08000000 /* unused */
|
||||
|
||||
/* ocfs2 extensions: internal only
|
||||
* should never be used by caller */
|
||||
#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated
|
||||
to another node */
|
||||
#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed
|
||||
should be applied to lockres */
|
||||
#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied
|
||||
from lockres when lock is granted */
|
||||
#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock
|
||||
used to avoid recovery rwsem */
|
||||
|
||||
|
||||
typedef void (dlm_astlockfunc_t)(void *);
|
||||
typedef void (dlm_bastlockfunc_t)(void *, int);
|
||||
typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status);
|
||||
|
||||
enum dlm_status dlmlock(struct dlm_ctxt *dlm,
|
||||
int mode,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int flags,
|
||||
const char *name,
|
||||
dlm_astlockfunc_t *ast,
|
||||
void *data,
|
||||
dlm_bastlockfunc_t *bast);
|
||||
|
||||
enum dlm_status dlmunlock(struct dlm_ctxt *dlm,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int flags,
|
||||
dlm_astunlockfunc_t *unlockast,
|
||||
void *data);
|
||||
|
||||
struct dlm_ctxt * dlm_register_domain(const char *domain, u32 key);
|
||||
|
||||
void dlm_unregister_domain(struct dlm_ctxt *dlm);
|
||||
|
||||
void dlm_print_one_lock(struct dlm_lock *lockid);
|
||||
|
||||
typedef void (dlm_eviction_func)(int, void *);
|
||||
struct dlm_eviction_cb {
|
||||
struct list_head ec_item;
|
||||
dlm_eviction_func *ec_func;
|
||||
void *ec_data;
|
||||
};
|
||||
void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb,
|
||||
dlm_eviction_func *f,
|
||||
void *data);
|
||||
void dlm_register_eviction_cb(struct dlm_ctxt *dlm,
|
||||
struct dlm_eviction_cb *cb);
|
||||
void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb);
|
||||
|
||||
#endif /* DLMAPI_H */
|
466
fs/ocfs2/dlm/dlmast.c
Normal file
466
fs/ocfs2/dlm/dlmast.c
Normal file
@ -0,0 +1,466 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmast.c
|
||||
*
|
||||
* AST and BAST functionality for local and remote nodes
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/tcp.h"
|
||||
#include "cluster/endian.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
#include "dlmcommon.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DLM
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock);
|
||||
static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
|
||||
|
||||
/* Should be called as an ast gets queued to see if the new
|
||||
* lock level will obsolete a pending bast.
|
||||
* For example, if dlm_thread queued a bast for an EX lock that
|
||||
* was blocking another EX, but before sending the bast the
|
||||
* lock owner downconverted to NL, the bast is now obsolete.
|
||||
* Only the ast should be sent.
|
||||
* This is needed because the lock and convert paths can queue
|
||||
* asts out-of-band (not waiting for dlm_thread) in order to
|
||||
* allow for LKM_NOQUEUE to get immediate responses. */
|
||||
static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
|
||||
{
|
||||
assert_spin_locked(&dlm->ast_lock);
|
||||
assert_spin_locked(&lock->spinlock);
|
||||
|
||||
if (lock->ml.highest_blocked == LKM_IVMODE)
|
||||
return 0;
|
||||
BUG_ON(lock->ml.highest_blocked == LKM_NLMODE);
|
||||
|
||||
if (lock->bast_pending &&
|
||||
list_empty(&lock->bast_list))
|
||||
/* old bast already sent, ok */
|
||||
return 0;
|
||||
|
||||
if (lock->ml.type == LKM_EXMODE)
|
||||
/* EX blocks anything left, any bast still valid */
|
||||
return 0;
|
||||
else if (lock->ml.type == LKM_NLMODE)
|
||||
/* NL blocks nothing, no reason to send any bast, cancel it */
|
||||
return 1;
|
||||
else if (lock->ml.highest_blocked != LKM_EXMODE)
|
||||
/* PR only blocks EX */
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
|
||||
{
|
||||
mlog_entry_void();
|
||||
|
||||
BUG_ON(!dlm);
|
||||
BUG_ON(!lock);
|
||||
|
||||
assert_spin_locked(&dlm->ast_lock);
|
||||
if (!list_empty(&lock->ast_list)) {
|
||||
mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n",
|
||||
lock->ast_pending, lock->ml.type);
|
||||
BUG();
|
||||
}
|
||||
BUG_ON(!list_empty(&lock->ast_list));
|
||||
if (lock->ast_pending)
|
||||
mlog(0, "lock has an ast getting flushed right now\n");
|
||||
|
||||
/* putting lock on list, add a ref */
|
||||
dlm_lock_get(lock);
|
||||
spin_lock(&lock->spinlock);
|
||||
|
||||
/* check to see if this ast obsoletes the bast */
|
||||
if (dlm_should_cancel_bast(dlm, lock)) {
|
||||
struct dlm_lock_resource *res = lock->lockres;
|
||||
mlog(0, "%s: cancelling bast for %.*s\n",
|
||||
dlm->name, res->lockname.len, res->lockname.name);
|
||||
lock->bast_pending = 0;
|
||||
list_del_init(&lock->bast_list);
|
||||
lock->ml.highest_blocked = LKM_IVMODE;
|
||||
/* removing lock from list, remove a ref. guaranteed
|
||||
* this won't be the last ref because of the get above,
|
||||
* so res->spinlock will not be taken here */
|
||||
dlm_lock_put(lock);
|
||||
/* free up the reserved bast that we are cancelling.
|
||||
* guaranteed that this will not be the last reserved
|
||||
* ast because *both* an ast and a bast were reserved
|
||||
* to get to this point. the res->spinlock will not be
|
||||
* taken here */
|
||||
dlm_lockres_release_ast(dlm, res);
|
||||
}
|
||||
list_add_tail(&lock->ast_list, &dlm->pending_asts);
|
||||
lock->ast_pending = 1;
|
||||
spin_unlock(&lock->spinlock);
|
||||
}
|
||||
|
||||
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
|
||||
{
|
||||
mlog_entry_void();
|
||||
|
||||
BUG_ON(!dlm);
|
||||
BUG_ON(!lock);
|
||||
|
||||
spin_lock(&dlm->ast_lock);
|
||||
__dlm_queue_ast(dlm, lock);
|
||||
spin_unlock(&dlm->ast_lock);
|
||||
}
|
||||
|
||||
|
||||
static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
|
||||
{
|
||||
mlog_entry_void();
|
||||
|
||||
BUG_ON(!dlm);
|
||||
BUG_ON(!lock);
|
||||
assert_spin_locked(&dlm->ast_lock);
|
||||
|
||||
BUG_ON(!list_empty(&lock->bast_list));
|
||||
if (lock->bast_pending)
|
||||
mlog(0, "lock has a bast getting flushed right now\n");
|
||||
|
||||
/* putting lock on list, add a ref */
|
||||
dlm_lock_get(lock);
|
||||
spin_lock(&lock->spinlock);
|
||||
list_add_tail(&lock->bast_list, &dlm->pending_basts);
|
||||
lock->bast_pending = 1;
|
||||
spin_unlock(&lock->spinlock);
|
||||
}
|
||||
|
||||
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
|
||||
{
|
||||
mlog_entry_void();
|
||||
|
||||
BUG_ON(!dlm);
|
||||
BUG_ON(!lock);
|
||||
|
||||
spin_lock(&dlm->ast_lock);
|
||||
__dlm_queue_bast(dlm, lock);
|
||||
spin_unlock(&dlm->ast_lock);
|
||||
}
|
||||
|
||||
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
struct dlm_lockstatus *lksb = lock->lksb;
|
||||
BUG_ON(!lksb);
|
||||
|
||||
/* only updates if this node masters the lockres */
|
||||
if (res->owner == dlm->node_num) {
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
/* check the lksb flags for the direction */
|
||||
if (lksb->flags & DLM_LKSB_GET_LVB) {
|
||||
mlog(0, "getting lvb from lockres for %s node\n",
|
||||
lock->ml.node == dlm->node_num ? "master" :
|
||||
"remote");
|
||||
memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
|
||||
} else if (lksb->flags & DLM_LKSB_PUT_LVB) {
|
||||
mlog(0, "setting lvb from lockres for %s node\n",
|
||||
lock->ml.node == dlm->node_num ? "master" :
|
||||
"remote");
|
||||
memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
|
||||
}
|
||||
spin_unlock(&res->spinlock);
|
||||
}
|
||||
|
||||
/* reset any lvb flags on the lksb */
|
||||
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
|
||||
}
|
||||
|
||||
void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
dlm_astlockfunc_t *fn;
|
||||
struct dlm_lockstatus *lksb;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
lksb = lock->lksb;
|
||||
fn = lock->ast;
|
||||
BUG_ON(lock->ml.node != dlm->node_num);
|
||||
|
||||
dlm_update_lvb(dlm, res, lock);
|
||||
(*fn)(lock->astdata);
|
||||
}
|
||||
|
||||
|
||||
int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
int ret;
|
||||
struct dlm_lockstatus *lksb;
|
||||
int lksbflags;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
lksb = lock->lksb;
|
||||
BUG_ON(lock->ml.node == dlm->node_num);
|
||||
|
||||
lksbflags = lksb->flags;
|
||||
dlm_update_lvb(dlm, res, lock);
|
||||
|
||||
/* lock request came from another node
|
||||
* go do the ast over there */
|
||||
ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int blocked_type)
|
||||
{
|
||||
dlm_bastlockfunc_t *fn = lock->bast;
|
||||
|
||||
mlog_entry_void();
|
||||
BUG_ON(lock->ml.node != dlm->node_num);
|
||||
|
||||
(*fn)(lock->astdata, blocked_type);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
|
||||
{
|
||||
int ret;
|
||||
unsigned int locklen;
|
||||
struct dlm_ctxt *dlm = data;
|
||||
struct dlm_lock_resource *res = NULL;
|
||||
struct dlm_lock *lock = NULL;
|
||||
struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
|
||||
char *name;
|
||||
struct list_head *iter, *head=NULL;
|
||||
u64 cookie;
|
||||
u32 flags;
|
||||
|
||||
if (!dlm_grab(dlm)) {
|
||||
dlm_error(DLM_REJECTED);
|
||||
return DLM_REJECTED;
|
||||
}
|
||||
|
||||
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
|
||||
"Domain %s not fully joined!\n", dlm->name);
|
||||
|
||||
name = past->name;
|
||||
locklen = past->namelen;
|
||||
cookie = be64_to_cpu(past->cookie);
|
||||
flags = be32_to_cpu(past->flags);
|
||||
|
||||
if (locklen > DLM_LOCKID_NAME_MAX) {
|
||||
ret = DLM_IVBUFLEN;
|
||||
mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
|
||||
goto leave;
|
||||
}
|
||||
|
||||
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
|
||||
(LKM_PUT_LVB|LKM_GET_LVB)) {
|
||||
mlog(ML_ERROR, "both PUT and GET lvb specified\n");
|
||||
ret = DLM_BADARGS;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
|
||||
(flags & LKM_GET_LVB ? "get lvb" : "none"));
|
||||
|
||||
mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type);
|
||||
|
||||
if (past->type != DLM_AST &&
|
||||
past->type != DLM_BAST) {
|
||||
mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", "
|
||||
"name=%.*s\n", past->type, cookie, locklen, name);
|
||||
ret = DLM_IVLOCKID;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
res = dlm_lookup_lockres(dlm, name, locklen);
|
||||
if (!res) {
|
||||
mlog(ML_ERROR, "got %sast for unknown lockres! "
|
||||
"cookie=%"MLFu64", name=%.*s, namelen=%u\n",
|
||||
past->type == DLM_AST ? "" : "b",
|
||||
cookie, locklen, name, locklen);
|
||||
ret = DLM_IVLOCKID;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
/* cannot get a proxy ast message if this node owns it */
|
||||
BUG_ON(res->owner == dlm->node_num);
|
||||
|
||||
mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
if (res->state & DLM_LOCK_RES_RECOVERING) {
|
||||
mlog(0, "responding with DLM_RECOVERING!\n");
|
||||
ret = DLM_RECOVERING;
|
||||
goto unlock_out;
|
||||
}
|
||||
if (res->state & DLM_LOCK_RES_MIGRATING) {
|
||||
mlog(0, "responding with DLM_MIGRATING!\n");
|
||||
ret = DLM_MIGRATING;
|
||||
goto unlock_out;
|
||||
}
|
||||
/* try convert queue for both ast/bast */
|
||||
head = &res->converting;
|
||||
lock = NULL;
|
||||
list_for_each(iter, head) {
|
||||
lock = list_entry (iter, struct dlm_lock, list);
|
||||
if (be64_to_cpu(lock->ml.cookie) == cookie)
|
||||
goto do_ast;
|
||||
}
|
||||
|
||||
/* if not on convert, try blocked for ast, granted for bast */
|
||||
if (past->type == DLM_AST)
|
||||
head = &res->blocked;
|
||||
else
|
||||
head = &res->granted;
|
||||
|
||||
list_for_each(iter, head) {
|
||||
lock = list_entry (iter, struct dlm_lock, list);
|
||||
if (be64_to_cpu(lock->ml.cookie) == cookie)
|
||||
goto do_ast;
|
||||
}
|
||||
|
||||
mlog(ML_ERROR, "got %sast for unknown lock! cookie=%"MLFu64", "
|
||||
"name=%.*s, namelen=%u\n",
|
||||
past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen);
|
||||
|
||||
ret = DLM_NORMAL;
|
||||
unlock_out:
|
||||
spin_unlock(&res->spinlock);
|
||||
goto leave;
|
||||
|
||||
do_ast:
|
||||
ret = DLM_NORMAL;
|
||||
if (past->type == DLM_AST) {
|
||||
/* do not alter lock refcount. switching lists. */
|
||||
list_del_init(&lock->list);
|
||||
list_add_tail(&lock->list, &res->granted);
|
||||
mlog(0, "ast: adding to granted list... type=%d, "
|
||||
"convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
|
||||
if (lock->ml.convert_type != LKM_IVMODE) {
|
||||
lock->ml.type = lock->ml.convert_type;
|
||||
lock->ml.convert_type = LKM_IVMODE;
|
||||
} else {
|
||||
// should already be there....
|
||||
}
|
||||
|
||||
lock->lksb->status = DLM_NORMAL;
|
||||
|
||||
/* if we requested the lvb, fetch it into our lksb now */
|
||||
if (flags & LKM_GET_LVB) {
|
||||
BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB));
|
||||
memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN);
|
||||
}
|
||||
}
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
if (past->type == DLM_AST)
|
||||
dlm_do_local_ast(dlm, res, lock);
|
||||
else
|
||||
dlm_do_local_bast(dlm, res, lock, past->blocked_type);
|
||||
|
||||
leave:
|
||||
|
||||
if (res)
|
||||
dlm_lockres_put(res);
|
||||
|
||||
dlm_put(dlm);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int msg_type,
|
||||
int blocked_type, int flags)
|
||||
{
|
||||
int ret = 0;
|
||||
struct dlm_proxy_ast past;
|
||||
struct kvec vec[2];
|
||||
size_t veclen = 1;
|
||||
int status;
|
||||
|
||||
mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n",
|
||||
res->lockname.len, res->lockname.name, lock->ml.node,
|
||||
msg_type, blocked_type);
|
||||
|
||||
memset(&past, 0, sizeof(struct dlm_proxy_ast));
|
||||
past.node_idx = dlm->node_num;
|
||||
past.type = msg_type;
|
||||
past.blocked_type = blocked_type;
|
||||
past.namelen = res->lockname.len;
|
||||
memcpy(past.name, res->lockname.name, past.namelen);
|
||||
past.cookie = lock->ml.cookie;
|
||||
|
||||
vec[0].iov_len = sizeof(struct dlm_proxy_ast);
|
||||
vec[0].iov_base = &past;
|
||||
if (flags & DLM_LKSB_GET_LVB) {
|
||||
mlog(0, "returning requested LVB data\n");
|
||||
be32_add_cpu(&past.flags, LKM_GET_LVB);
|
||||
vec[1].iov_len = DLM_LVB_LEN;
|
||||
vec[1].iov_base = lock->lksb->lvb;
|
||||
veclen++;
|
||||
}
|
||||
|
||||
ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen,
|
||||
lock->ml.node, &status);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
else {
|
||||
if (status == DLM_RECOVERING) {
|
||||
mlog(ML_ERROR, "sent AST to node %u, it thinks this "
|
||||
"node is dead!\n", lock->ml.node);
|
||||
BUG();
|
||||
} else if (status == DLM_MIGRATING) {
|
||||
mlog(ML_ERROR, "sent AST to node %u, it returned "
|
||||
"DLM_MIGRATING!\n", lock->ml.node);
|
||||
BUG();
|
||||
} else if (status != DLM_NORMAL) {
|
||||
mlog(ML_ERROR, "AST to node %u returned %d!\n",
|
||||
lock->ml.node, status);
|
||||
/* ignore it */
|
||||
}
|
||||
ret = 0;
|
||||
}
|
||||
return ret;
|
||||
}
|
884
fs/ocfs2/dlm/dlmcommon.h
Normal file
884
fs/ocfs2/dlm/dlmcommon.h
Normal file
@ -0,0 +1,884 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmcommon.h
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef DLMCOMMON_H
|
||||
#define DLMCOMMON_H
|
||||
|
||||
#include <linux/kref.h>
|
||||
|
||||
#define DLM_HB_NODE_DOWN_PRI (0xf000000)
|
||||
#define DLM_HB_NODE_UP_PRI (0x8000000)
|
||||
|
||||
#define DLM_LOCKID_NAME_MAX 32
|
||||
|
||||
#define DLM_DOMAIN_NAME_MAX_LEN 255
|
||||
#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES
|
||||
#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
|
||||
#define DLM_THREAD_MS 200 // flush at least every 200 ms
|
||||
|
||||
#define DLM_HASH_BITS 7
|
||||
#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
|
||||
#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
|
||||
|
||||
enum dlm_ast_type {
|
||||
DLM_AST = 0,
|
||||
DLM_BAST,
|
||||
DLM_ASTUNLOCK
|
||||
};
|
||||
|
||||
|
||||
#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \
|
||||
LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \
|
||||
LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE)
|
||||
|
||||
#define DLM_RECOVERY_LOCK_NAME       "$RECOVERY"
#define DLM_RECOVERY_LOCK_NAME_LEN   9

/*
 * Return 1 when the name (@lock_name, @name_len) is exactly the recovery
 * lock name DLM_RECOVERY_LOCK_NAME, 0 otherwise.  @lock_name does not need
 * to be NUL-terminated; only @name_len bytes are compared.
 */
static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
{
	if (name_len != DLM_RECOVERY_LOCK_NAME_LEN)
		return 0;

	return memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len) == 0;
}
|
||||
|
||||
#define DLM_RECO_STATE_ACTIVE 0x0001
|
||||
|
||||
struct dlm_recovery_ctxt
|
||||
{
|
||||
struct list_head resources;
|
||||
struct list_head received;
|
||||
struct list_head node_data;
|
||||
u8 new_master;
|
||||
u8 dead_node;
|
||||
u16 state;
|
||||
unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
wait_queue_head_t event;
|
||||
};
|
||||
|
||||
enum dlm_ctxt_state {
|
||||
DLM_CTXT_NEW = 0,
|
||||
DLM_CTXT_JOINED,
|
||||
DLM_CTXT_IN_SHUTDOWN,
|
||||
DLM_CTXT_LEAVING,
|
||||
};
|
||||
|
||||
struct dlm_ctxt
|
||||
{
|
||||
struct list_head list;
|
||||
struct list_head *resources;
|
||||
struct list_head dirty_list;
|
||||
struct list_head purge_list;
|
||||
struct list_head pending_asts;
|
||||
struct list_head pending_basts;
|
||||
unsigned int purge_count;
|
||||
spinlock_t spinlock;
|
||||
spinlock_t ast_lock;
|
||||
char *name;
|
||||
u8 node_num;
|
||||
u32 key;
|
||||
u8 joining_node;
|
||||
wait_queue_head_t dlm_join_events;
|
||||
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
struct dlm_recovery_ctxt reco;
|
||||
spinlock_t master_lock;
|
||||
struct list_head master_list;
|
||||
struct list_head mle_hb_events;
|
||||
|
||||
/* these give a really vague idea of the system load */
|
||||
atomic_t local_resources;
|
||||
atomic_t remote_resources;
|
||||
atomic_t unknown_resources;
|
||||
|
||||
/* NOTE: Next three are protected by dlm_domain_lock */
|
||||
struct kref dlm_refs;
|
||||
enum dlm_ctxt_state dlm_state;
|
||||
unsigned int num_joins;
|
||||
|
||||
struct o2hb_callback_func dlm_hb_up;
|
||||
struct o2hb_callback_func dlm_hb_down;
|
||||
struct task_struct *dlm_thread_task;
|
||||
struct task_struct *dlm_reco_thread_task;
|
||||
wait_queue_head_t dlm_thread_wq;
|
||||
wait_queue_head_t dlm_reco_thread_wq;
|
||||
wait_queue_head_t ast_wq;
|
||||
wait_queue_head_t migration_wq;
|
||||
|
||||
struct work_struct dispatched_work;
|
||||
struct list_head work_list;
|
||||
spinlock_t work_lock;
|
||||
struct list_head dlm_domain_handlers;
|
||||
struct list_head dlm_eviction_callbacks;
|
||||
};
|
||||
|
||||
/* these keventd work queue items are for less-frequently
|
||||
* called functions that cannot be directly called from the
|
||||
* net message handlers for some reason, usually because
|
||||
* they need to send net messages of their own. */
|
||||
void dlm_dispatch_work(void *data);
|
||||
|
||||
struct dlm_lock_resource;
|
||||
struct dlm_work_item;
|
||||
|
||||
typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *);
|
||||
|
||||
struct dlm_request_all_locks_priv
|
||||
{
|
||||
u8 reco_master;
|
||||
u8 dead_node;
|
||||
};
|
||||
|
||||
struct dlm_mig_lockres_priv
|
||||
{
|
||||
struct dlm_lock_resource *lockres;
|
||||
u8 real_master;
|
||||
};
|
||||
|
||||
struct dlm_assert_master_priv
|
||||
{
|
||||
struct dlm_lock_resource *lockres;
|
||||
u8 request_from;
|
||||
u32 flags;
|
||||
unsigned ignore_higher:1;
|
||||
};
|
||||
|
||||
|
||||
struct dlm_work_item
|
||||
{
|
||||
struct list_head list;
|
||||
dlm_workfunc_t *func;
|
||||
struct dlm_ctxt *dlm;
|
||||
void *data;
|
||||
union {
|
||||
struct dlm_request_all_locks_priv ral;
|
||||
struct dlm_mig_lockres_priv ml;
|
||||
struct dlm_assert_master_priv am;
|
||||
} u;
|
||||
};
|
||||
|
||||
static inline void dlm_init_work_item(struct dlm_ctxt *dlm,
|
||||
struct dlm_work_item *i,
|
||||
dlm_workfunc_t *f, void *data)
|
||||
{
|
||||
memset(i, 0, sizeof(*i));
|
||||
i->func = f;
|
||||
INIT_LIST_HEAD(&i->list);
|
||||
i->data = data;
|
||||
i->dlm = dlm; /* must have already done a dlm_grab on this! */
|
||||
}
|
||||
|
||||
|
||||
|
||||
static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
|
||||
u8 node)
|
||||
{
|
||||
assert_spin_locked(&dlm->spinlock);
|
||||
|
||||
dlm->joining_node = node;
|
||||
wake_up(&dlm->dlm_join_events);
|
||||
}
|
||||
|
||||
#define DLM_LOCK_RES_UNINITED 0x00000001
|
||||
#define DLM_LOCK_RES_RECOVERING 0x00000002
|
||||
#define DLM_LOCK_RES_READY 0x00000004
|
||||
#define DLM_LOCK_RES_DIRTY 0x00000008
|
||||
#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
|
||||
#define DLM_LOCK_RES_MIGRATING 0x00000020
|
||||
|
||||
#define DLM_PURGE_INTERVAL_MS (8 * 1000)
|
||||
|
||||
struct dlm_lock_resource
|
||||
{
|
||||
/* WARNING: Please see the comment in dlm_init_lockres before
|
||||
* adding fields here. */
|
||||
struct list_head list;
|
||||
struct kref refs;
|
||||
|
||||
/* please keep these next 3 in this order
|
||||
* some funcs want to iterate over all lists */
|
||||
struct list_head granted;
|
||||
struct list_head converting;
|
||||
struct list_head blocked;
|
||||
|
||||
struct list_head dirty;
|
||||
struct list_head recovering; // dlm_recovery_ctxt.resources list
|
||||
|
||||
/* unused lock resources have their last_used stamped and are
|
||||
* put on a list for the dlm thread to run. */
|
||||
struct list_head purge;
|
||||
unsigned long last_used;
|
||||
|
||||
unsigned migration_pending:1;
|
||||
atomic_t asts_reserved;
|
||||
spinlock_t spinlock;
|
||||
wait_queue_head_t wq;
|
||||
u8 owner; //node which owns the lock resource, or unknown
|
||||
u16 state;
|
||||
struct qstr lockname;
|
||||
char lvb[DLM_LVB_LEN];
|
||||
};
|
||||
|
||||
struct dlm_migratable_lock
|
||||
{
|
||||
__be64 cookie;
|
||||
|
||||
/* these 3 are just padding for the in-memory structure, but
|
||||
* list and flags are actually used when sent over the wire */
|
||||
__be16 pad1;
|
||||
u8 list; // 0=granted, 1=converting, 2=blocked
|
||||
u8 flags;
|
||||
|
||||
s8 type;
|
||||
s8 convert_type;
|
||||
s8 highest_blocked;
|
||||
u8 node;
|
||||
}; // 16 bytes
|
||||
|
||||
struct dlm_lock
|
||||
{
|
||||
struct dlm_migratable_lock ml;
|
||||
|
||||
struct list_head list;
|
||||
struct list_head ast_list;
|
||||
struct list_head bast_list;
|
||||
struct dlm_lock_resource *lockres;
|
||||
spinlock_t spinlock;
|
||||
struct kref lock_refs;
|
||||
|
||||
// ast and bast must be callable while holding a spinlock!
|
||||
dlm_astlockfunc_t *ast;
|
||||
dlm_bastlockfunc_t *bast;
|
||||
void *astdata;
|
||||
struct dlm_lockstatus *lksb;
|
||||
unsigned ast_pending:1,
|
||||
bast_pending:1,
|
||||
convert_pending:1,
|
||||
lock_pending:1,
|
||||
cancel_pending:1,
|
||||
unlock_pending:1,
|
||||
lksb_kernel_allocated:1;
|
||||
};
|
||||
|
||||
|
||||
#define DLM_LKSB_UNUSED1 0x01
|
||||
#define DLM_LKSB_PUT_LVB 0x02
|
||||
#define DLM_LKSB_GET_LVB 0x04
|
||||
#define DLM_LKSB_UNUSED2 0x08
|
||||
#define DLM_LKSB_UNUSED3 0x10
|
||||
#define DLM_LKSB_UNUSED4 0x20
|
||||
#define DLM_LKSB_UNUSED5 0x40
|
||||
#define DLM_LKSB_UNUSED6 0x80
|
||||
|
||||
|
||||
enum dlm_lockres_list {
|
||||
DLM_GRANTED_LIST = 0,
|
||||
DLM_CONVERTING_LIST,
|
||||
DLM_BLOCKED_LIST
|
||||
};
|
||||
|
||||
static inline struct list_head *
|
||||
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
|
||||
{
|
||||
struct list_head *ret = NULL;
|
||||
if (idx == DLM_GRANTED_LIST)
|
||||
ret = &res->granted;
|
||||
else if (idx == DLM_CONVERTING_LIST)
|
||||
ret = &res->converting;
|
||||
else if (idx == DLM_BLOCKED_LIST)
|
||||
ret = &res->blocked;
|
||||
else
|
||||
BUG();
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
struct dlm_node_iter
|
||||
{
|
||||
unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
int curnode;
|
||||
};
|
||||
|
||||
|
||||
enum {
|
||||
DLM_MASTER_REQUEST_MSG = 500,
|
||||
DLM_UNUSED_MSG1, /* 501 */
|
||||
DLM_ASSERT_MASTER_MSG, /* 502 */
|
||||
DLM_CREATE_LOCK_MSG, /* 503 */
|
||||
DLM_CONVERT_LOCK_MSG, /* 504 */
|
||||
DLM_PROXY_AST_MSG, /* 505 */
|
||||
DLM_UNLOCK_LOCK_MSG, /* 506 */
|
||||
DLM_UNUSED_MSG2, /* 507 */
|
||||
DLM_MIGRATE_REQUEST_MSG, /* 508 */
|
||||
DLM_MIG_LOCKRES_MSG, /* 509 */
|
||||
DLM_QUERY_JOIN_MSG, /* 510 */
|
||||
DLM_ASSERT_JOINED_MSG, /* 511 */
|
||||
DLM_CANCEL_JOIN_MSG, /* 512 */
|
||||
DLM_EXIT_DOMAIN_MSG, /* 513 */
|
||||
DLM_MASTER_REQUERY_MSG, /* 514 */
|
||||
DLM_LOCK_REQUEST_MSG, /* 515 */
|
||||
DLM_RECO_DATA_DONE_MSG, /* 516 */
|
||||
DLM_BEGIN_RECO_MSG, /* 517 */
|
||||
DLM_FINALIZE_RECO_MSG /* 518 */
|
||||
};
|
||||
|
||||
struct dlm_reco_node_data
|
||||
{
|
||||
int state;
|
||||
u8 node_num;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
enum {
|
||||
DLM_RECO_NODE_DATA_DEAD = -1,
|
||||
DLM_RECO_NODE_DATA_INIT = 0,
|
||||
DLM_RECO_NODE_DATA_REQUESTING,
|
||||
DLM_RECO_NODE_DATA_REQUESTED,
|
||||
DLM_RECO_NODE_DATA_RECEIVING,
|
||||
DLM_RECO_NODE_DATA_DONE,
|
||||
DLM_RECO_NODE_DATA_FINALIZE_SENT,
|
||||
};
|
||||
|
||||
|
||||
enum {
|
||||
DLM_MASTER_RESP_NO = 0,
|
||||
DLM_MASTER_RESP_YES,
|
||||
DLM_MASTER_RESP_MAYBE,
|
||||
DLM_MASTER_RESP_ERROR
|
||||
};
|
||||
|
||||
|
||||
struct dlm_master_request
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 namelen;
|
||||
__be16 pad1;
|
||||
__be32 flags;
|
||||
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001
|
||||
#define DLM_ASSERT_MASTER_REQUERY 0x00000002
|
||||
#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004
|
||||
struct dlm_assert_master
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 namelen;
|
||||
__be16 pad1;
|
||||
__be32 flags;
|
||||
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
struct dlm_migrate_request
|
||||
{
|
||||
u8 master;
|
||||
u8 new_master;
|
||||
u8 namelen;
|
||||
u8 pad1;
|
||||
__be32 pad2;
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
struct dlm_master_requery
|
||||
{
|
||||
u8 pad1;
|
||||
u8 pad2;
|
||||
u8 node_idx;
|
||||
u8 namelen;
|
||||
__be32 pad3;
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
#define DLM_MRES_RECOVERY 0x01
|
||||
#define DLM_MRES_MIGRATION 0x02
|
||||
#define DLM_MRES_ALL_DONE 0x04
|
||||
|
||||
/*
|
||||
* We would like to get one whole lockres into a single network
|
||||
* message whenever possible. Generally speaking, there will be
|
||||
* at most one dlm_lock on a lockres for each node in the cluster,
|
||||
* plus (infrequently) any additional locks coming in from userdlm.
|
||||
*
|
||||
* struct _dlm_lockres_page
|
||||
* {
|
||||
* dlm_migratable_lockres mres;
|
||||
* dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS];
|
||||
* u8 pad[DLM_MIG_LOCKRES_RESERVED];
|
||||
* };
|
||||
*
|
||||
* from ../cluster/tcp.h
|
||||
* NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg))
|
||||
* (roughly 4080 bytes)
|
||||
* and sizeof(dlm_migratable_lockres) = 112 bytes
|
||||
* and sizeof(dlm_migratable_lock) = 16 bytes
|
||||
*
|
||||
* Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and
|
||||
* DLM_MIG_LOCKRES_RESERVED=128 means we have this:
|
||||
*
|
||||
* (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) +
|
||||
* sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED =
|
||||
* NET_MAX_PAYLOAD_BYTES
|
||||
* (240 * 16) + 112 + 128 = 4080
|
||||
*
|
||||
* So a lockres would need more than 240 locks before it would
|
||||
* use more than one network packet to recover. Not too bad.
|
||||
*/
|
||||
#define DLM_MAX_MIGRATABLE_LOCKS 240
|
||||
|
||||
struct dlm_migratable_lockres
|
||||
{
|
||||
u8 master;
|
||||
u8 lockname_len;
|
||||
u8 num_locks; // locks sent in this structure
|
||||
u8 flags;
|
||||
__be32 total_locks; // locks to be sent for this migration cookie
|
||||
__be64 mig_cookie; // cookie for this lockres migration
|
||||
// or zero if not needed
|
||||
// 16 bytes
|
||||
u8 lockname[DLM_LOCKID_NAME_MAX];
|
||||
// 48 bytes
|
||||
u8 lvb[DLM_LVB_LEN];
|
||||
// 112 bytes
|
||||
struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112
|
||||
};
|
||||
#define DLM_MIG_LOCKRES_MAX_LEN \
|
||||
(sizeof(struct dlm_migratable_lockres) + \
|
||||
(sizeof(struct dlm_migratable_lock) * \
|
||||
DLM_MAX_MIGRATABLE_LOCKS) )
|
||||
|
||||
/* from above, 128 bytes
|
||||
* for some undetermined future use */
|
||||
#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \
|
||||
DLM_MIG_LOCKRES_MAX_LEN)
|
||||
|
||||
struct dlm_create_lock
|
||||
{
|
||||
__be64 cookie;
|
||||
|
||||
__be32 flags;
|
||||
u8 pad1;
|
||||
u8 node_idx;
|
||||
s8 requested_type;
|
||||
u8 namelen;
|
||||
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
struct dlm_convert_lock
|
||||
{
|
||||
__be64 cookie;
|
||||
|
||||
__be32 flags;
|
||||
u8 pad1;
|
||||
u8 node_idx;
|
||||
s8 requested_type;
|
||||
u8 namelen;
|
||||
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
|
||||
s8 lvb[0];
|
||||
};
|
||||
#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN)
|
||||
|
||||
struct dlm_unlock_lock
|
||||
{
|
||||
__be64 cookie;
|
||||
|
||||
__be32 flags;
|
||||
__be16 pad1;
|
||||
u8 node_idx;
|
||||
u8 namelen;
|
||||
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
|
||||
s8 lvb[0];
|
||||
};
|
||||
#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN)
|
||||
|
||||
struct dlm_proxy_ast
|
||||
{
|
||||
__be64 cookie;
|
||||
|
||||
__be32 flags;
|
||||
u8 node_idx;
|
||||
u8 type;
|
||||
u8 blocked_type;
|
||||
u8 namelen;
|
||||
|
||||
u8 name[O2NM_MAX_NAME_LEN];
|
||||
|
||||
s8 lvb[0];
|
||||
};
|
||||
#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN)
|
||||
|
||||
#define DLM_MOD_KEY (0x666c6172)
|
||||
enum dlm_query_join_response {
|
||||
JOIN_DISALLOW = 0,
|
||||
JOIN_OK,
|
||||
JOIN_OK_NO_MAP,
|
||||
};
|
||||
|
||||
struct dlm_lock_request
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 dead_node;
|
||||
__be16 pad1;
|
||||
__be32 pad2;
|
||||
};
|
||||
|
||||
struct dlm_reco_data_done
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 dead_node;
|
||||
__be16 pad1;
|
||||
__be32 pad2;
|
||||
|
||||
/* unused for now */
|
||||
/* eventually we can use this to attempt
|
||||
* lvb recovery based on each node's info */
|
||||
u8 reco_lvb[DLM_LVB_LEN];
|
||||
};
|
||||
|
||||
struct dlm_begin_reco
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 dead_node;
|
||||
__be16 pad1;
|
||||
__be32 pad2;
|
||||
};
|
||||
|
||||
|
||||
struct dlm_query_join_request
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 pad1[2];
|
||||
u8 name_len;
|
||||
u8 domain[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
struct dlm_assert_joined
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 pad1[2];
|
||||
u8 name_len;
|
||||
u8 domain[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
struct dlm_cancel_join
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 pad1[2];
|
||||
u8 name_len;
|
||||
u8 domain[O2NM_MAX_NAME_LEN];
|
||||
};
|
||||
|
||||
struct dlm_exit_domain
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 pad1[3];
|
||||
};
|
||||
|
||||
struct dlm_finalize_reco
|
||||
{
|
||||
u8 node_idx;
|
||||
u8 dead_node;
|
||||
__be16 pad1;
|
||||
__be32 pad2;
|
||||
};
|
||||
|
||||
static inline enum dlm_status
|
||||
__dlm_lockres_state_to_status(struct dlm_lock_resource *res)
|
||||
{
|
||||
enum dlm_status status = DLM_NORMAL;
|
||||
|
||||
assert_spin_locked(&res->spinlock);
|
||||
|
||||
if (res->state & DLM_LOCK_RES_RECOVERING)
|
||||
status = DLM_RECOVERING;
|
||||
else if (res->state & DLM_LOCK_RES_MIGRATING)
|
||||
status = DLM_MIGRATING;
|
||||
else if (res->state & DLM_LOCK_RES_IN_PROGRESS)
|
||||
status = DLM_FORWARD;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
|
||||
struct dlm_lockstatus *lksb);
|
||||
void dlm_lock_get(struct dlm_lock *lock);
|
||||
void dlm_lock_put(struct dlm_lock *lock);
|
||||
|
||||
void dlm_lock_attach_lockres(struct dlm_lock *lock,
|
||||
struct dlm_lock_resource *res);
|
||||
|
||||
int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
|
||||
void dlm_revert_pending_convert(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock);
|
||||
void dlm_revert_pending_lock(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock);
|
||||
|
||||
int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock);
|
||||
void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock);
|
||||
|
||||
int dlm_launch_thread(struct dlm_ctxt *dlm);
|
||||
void dlm_complete_thread(struct dlm_ctxt *dlm);
|
||||
int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
|
||||
void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
|
||||
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
|
||||
|
||||
void dlm_put(struct dlm_ctxt *dlm);
|
||||
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
|
||||
int dlm_domain_fully_joined(struct dlm_ctxt *dlm);
|
||||
|
||||
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res);
|
||||
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res);
|
||||
void dlm_purge_lockres(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *lockres);
|
||||
void dlm_lockres_get(struct dlm_lock_resource *res);
|
||||
void dlm_lockres_put(struct dlm_lock_resource *res);
|
||||
void __dlm_unhash_lockres(struct dlm_lock_resource *res);
|
||||
void __dlm_insert_lockres(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res);
|
||||
struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
|
||||
const char *name,
|
||||
unsigned int len);
|
||||
struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
|
||||
const char *name,
|
||||
unsigned int len);
|
||||
|
||||
int dlm_is_host_down(int errno);
|
||||
void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
u8 owner);
|
||||
struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
|
||||
const char *lockid,
|
||||
int flags);
|
||||
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
|
||||
const char *name,
|
||||
unsigned int namelen);
|
||||
|
||||
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
|
||||
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
|
||||
void dlm_do_local_ast(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock);
|
||||
int dlm_do_remote_ast(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock);
|
||||
void dlm_do_local_bast(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
int blocked_type);
|
||||
int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
int msg_type,
|
||||
int blocked_type, int flags);
|
||||
static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
int blocked_type)
|
||||
{
|
||||
return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST,
|
||||
blocked_type, 0);
|
||||
}
|
||||
|
||||
static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
int flags)
|
||||
{
|
||||
return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST,
|
||||
0, flags);
|
||||
}
|
||||
|
||||
void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
|
||||
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
|
||||
|
||||
u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
|
||||
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
|
||||
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
|
||||
|
||||
|
||||
int dlm_nm_init(struct dlm_ctxt *dlm);
|
||||
int dlm_heartbeat_init(struct dlm_ctxt *dlm);
|
||||
void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
|
||||
void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
|
||||
|
||||
int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
|
||||
int dlm_migrate_lockres(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
u8 target);
|
||||
int dlm_finish_migration(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
u8 old_master);
|
||||
void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res);
|
||||
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res);
|
||||
|
||||
int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data);
|
||||
|
||||
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
int ignore_higher,
|
||||
u8 request_from,
|
||||
u32 flags);
|
||||
|
||||
|
||||
int dlm_send_one_lockres(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_migratable_lockres *mres,
|
||||
u8 send_to,
|
||||
u8 flags);
|
||||
void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res);
|
||||
|
||||
/* will exit holding res->spinlock, but may drop in function */
|
||||
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
|
||||
void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
|
||||
|
||||
/* will exit holding res->spinlock, but may drop in function */
|
||||
static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
|
||||
{
|
||||
__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
|
||||
DLM_LOCK_RES_RECOVERING|
|
||||
DLM_LOCK_RES_MIGRATING));
|
||||
}
|
||||
|
||||
|
||||
int dlm_init_mle_cache(void);
|
||||
void dlm_destroy_mle_cache(void);
|
||||
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
|
||||
void dlm_clean_master_list(struct dlm_ctxt *dlm,
|
||||
u8 dead_node);
|
||||
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
|
||||
|
||||
|
||||
static inline const char * dlm_lock_mode_name(int mode)
|
||||
{
|
||||
switch (mode) {
|
||||
case LKM_EXMODE:
|
||||
return "EX";
|
||||
case LKM_PRMODE:
|
||||
return "PR";
|
||||
case LKM_NLMODE:
|
||||
return "NL";
|
||||
}
|
||||
return "UNKNOWN";
|
||||
}
|
||||
|
||||
|
||||
static inline int dlm_lock_compatible(int existing, int request)
|
||||
{
|
||||
/* NO_LOCK compatible with all */
|
||||
if (request == LKM_NLMODE ||
|
||||
existing == LKM_NLMODE)
|
||||
return 1;
|
||||
|
||||
/* EX incompatible with all non-NO_LOCK */
|
||||
if (request == LKM_EXMODE)
|
||||
return 0;
|
||||
|
||||
/* request must be PR, which is compatible with PR */
|
||||
if (existing == LKM_PRMODE)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int dlm_lock_on_list(struct list_head *head,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
struct list_head *iter;
|
||||
struct dlm_lock *tmplock;
|
||||
|
||||
list_for_each(iter, head) {
|
||||
tmplock = list_entry(iter, struct dlm_lock, list);
|
||||
if (tmplock == lock)
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static inline enum dlm_status dlm_err_to_dlm_status(int err)
|
||||
{
|
||||
enum dlm_status ret;
|
||||
if (err == -ENOMEM)
|
||||
ret = DLM_SYSERR;
|
||||
else if (err == -ETIMEDOUT || o2net_link_down(err, NULL))
|
||||
ret = DLM_NOLOCKMGR;
|
||||
else if (err == -EINVAL)
|
||||
ret = DLM_BADPARAM;
|
||||
else if (err == -ENAMETOOLONG)
|
||||
ret = DLM_IVBUFLEN;
|
||||
else
|
||||
ret = DLM_BADARGS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static inline void dlm_node_iter_init(unsigned long *map,
|
||||
struct dlm_node_iter *iter)
|
||||
{
|
||||
memcpy(iter->node_map, map, sizeof(iter->node_map));
|
||||
iter->curnode = -1;
|
||||
}
|
||||
|
||||
static inline int dlm_node_iter_next(struct dlm_node_iter *iter)
|
||||
{
|
||||
int bit;
|
||||
bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1);
|
||||
if (bit >= O2NM_MAX_NODES) {
|
||||
iter->curnode = O2NM_MAX_NODES;
|
||||
return -ENOENT;
|
||||
}
|
||||
iter->curnode = bit;
|
||||
return bit;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif /* DLMCOMMON_H */
|
530
fs/ocfs2/dlm/dlmconvert.c
Normal file
530
fs/ocfs2/dlm/dlmconvert.c
Normal file
@ -0,0 +1,530 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmconvert.c
|
||||
*
|
||||
* underlying calls for lock conversion
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
#include "dlmcommon.h"
|
||||
|
||||
#include "dlmconvert.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DLM
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
/* NOTE: __dlmconvert_master is the only function in here that
|
||||
* needs a spinlock held on entry (res->spinlock) and it is the
|
||||
* only one that holds a lock on exit (res->spinlock).
|
||||
* All other functions in here need no locks and drop all of
|
||||
* the locks that they acquire. */
|
||||
static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags,
|
||||
int type, int *call_ast,
|
||||
int *kick_thread);
|
||||
static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags, int type);
|
||||
|
||||
/*
|
||||
* this is only called directly by dlmlock(), and only when the
|
||||
* local node is the owner of the lockres
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: takes and drops res->spinlock
|
||||
* held on exit: none
|
||||
* returns: see __dlmconvert_master
|
||||
*/
|
||||
enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags, int type)
|
||||
{
|
||||
int call_ast = 0, kick_thread = 0;
|
||||
enum dlm_status status;
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
/* we are not in a network handler, this is fine */
|
||||
__dlm_wait_on_lockres(res);
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
res->state |= DLM_LOCK_RES_IN_PROGRESS;
|
||||
|
||||
status = __dlmconvert_master(dlm, res, lock, flags, type,
|
||||
&call_ast, &kick_thread);
|
||||
|
||||
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
|
||||
spin_unlock(&res->spinlock);
|
||||
wake_up(&res->wq);
|
||||
if (status != DLM_NORMAL && status != DLM_NOTQUEUED)
|
||||
dlm_error(status);
|
||||
|
||||
/* either queue the ast or release it */
|
||||
if (call_ast)
|
||||
dlm_queue_ast(dlm, lock);
|
||||
else
|
||||
dlm_lockres_release_ast(dlm, res);
|
||||
|
||||
if (kick_thread)
|
||||
dlm_kick_thread(dlm, res);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/* performs lock conversion at the lockres master site
|
||||
* locking:
|
||||
* caller needs: res->spinlock
|
||||
* taken: takes and drops lock->spinlock
|
||||
* held on exit: res->spinlock
|
||||
* returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED
|
||||
* call_ast: whether ast should be called for this lock
|
||||
* kick_thread: whether dlm_kick_thread should be called
|
||||
*/
|
||||
static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags,
|
||||
int type, int *call_ast,
|
||||
int *kick_thread)
|
||||
{
|
||||
enum dlm_status status = DLM_NORMAL;
|
||||
struct list_head *iter;
|
||||
struct dlm_lock *tmplock=NULL;
|
||||
|
||||
assert_spin_locked(&res->spinlock);
|
||||
|
||||
mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
|
||||
lock->ml.type, lock->ml.convert_type, type);
|
||||
|
||||
spin_lock(&lock->spinlock);
|
||||
|
||||
/* already converting? */
|
||||
if (lock->ml.convert_type != LKM_IVMODE) {
|
||||
mlog(ML_ERROR, "attempted to convert a lock with a lock "
|
||||
"conversion pending\n");
|
||||
status = DLM_DENIED;
|
||||
goto unlock_exit;
|
||||
}
|
||||
|
||||
/* must be on grant queue to convert */
|
||||
if (!dlm_lock_on_list(&res->granted, lock)) {
|
||||
mlog(ML_ERROR, "attempted to convert a lock not on grant "
|
||||
"queue\n");
|
||||
status = DLM_DENIED;
|
||||
goto unlock_exit;
|
||||
}
|
||||
|
||||
if (flags & LKM_VALBLK) {
|
||||
switch (lock->ml.type) {
|
||||
case LKM_EXMODE:
|
||||
/* EX + LKM_VALBLK + convert == set lvb */
|
||||
mlog(0, "will set lvb: converting %s->%s\n",
|
||||
dlm_lock_mode_name(lock->ml.type),
|
||||
dlm_lock_mode_name(type));
|
||||
lock->lksb->flags |= DLM_LKSB_PUT_LVB;
|
||||
break;
|
||||
case LKM_PRMODE:
|
||||
case LKM_NLMODE:
|
||||
/* refetch if new level is not NL */
|
||||
if (type > LKM_NLMODE) {
|
||||
mlog(0, "will fetch new value into "
|
||||
"lvb: converting %s->%s\n",
|
||||
dlm_lock_mode_name(lock->ml.type),
|
||||
dlm_lock_mode_name(type));
|
||||
lock->lksb->flags |= DLM_LKSB_GET_LVB;
|
||||
} else {
|
||||
mlog(0, "will NOT fetch new value "
|
||||
"into lvb: converting %s->%s\n",
|
||||
dlm_lock_mode_name(lock->ml.type),
|
||||
dlm_lock_mode_name(type));
|
||||
flags &= ~(LKM_VALBLK);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* in-place downconvert? */
|
||||
if (type <= lock->ml.type)
|
||||
goto grant;
|
||||
|
||||
/* upconvert from here on */
|
||||
status = DLM_NORMAL;
|
||||
list_for_each(iter, &res->granted) {
|
||||
tmplock = list_entry(iter, struct dlm_lock, list);
|
||||
if (tmplock == lock)
|
||||
continue;
|
||||
if (!dlm_lock_compatible(tmplock->ml.type, type))
|
||||
goto switch_queues;
|
||||
}
|
||||
|
||||
list_for_each(iter, &res->converting) {
|
||||
tmplock = list_entry(iter, struct dlm_lock, list);
|
||||
if (!dlm_lock_compatible(tmplock->ml.type, type))
|
||||
goto switch_queues;
|
||||
/* existing conversion requests take precedence */
|
||||
if (!dlm_lock_compatible(tmplock->ml.convert_type, type))
|
||||
goto switch_queues;
|
||||
}
|
||||
|
||||
/* fall thru to grant */
|
||||
|
||||
grant:
|
||||
mlog(0, "res %.*s, granting %s lock\n", res->lockname.len,
|
||||
res->lockname.name, dlm_lock_mode_name(type));
|
||||
/* immediately grant the new lock type */
|
||||
lock->lksb->status = DLM_NORMAL;
|
||||
if (lock->ml.node == dlm->node_num)
|
||||
mlog(0, "doing in-place convert for nonlocal lock\n");
|
||||
lock->ml.type = type;
|
||||
status = DLM_NORMAL;
|
||||
*call_ast = 1;
|
||||
goto unlock_exit;
|
||||
|
||||
switch_queues:
|
||||
if (flags & LKM_NOQUEUE) {
|
||||
mlog(0, "failed to convert NOQUEUE lock %.*s from "
|
||||
"%d to %d...\n", res->lockname.len, res->lockname.name,
|
||||
lock->ml.type, type);
|
||||
status = DLM_NOTQUEUED;
|
||||
goto unlock_exit;
|
||||
}
|
||||
mlog(0, "res %.*s, queueing...\n", res->lockname.len,
|
||||
res->lockname.name);
|
||||
|
||||
lock->ml.convert_type = type;
|
||||
/* do not alter lock refcount. switching lists. */
|
||||
list_del_init(&lock->list);
|
||||
list_add_tail(&lock->list, &res->converting);
|
||||
|
||||
unlock_exit:
|
||||
spin_unlock(&lock->spinlock);
|
||||
if (status == DLM_DENIED) {
|
||||
__dlm_print_one_lock_resource(res);
|
||||
}
|
||||
if (status == DLM_NORMAL)
|
||||
*kick_thread = 1;
|
||||
return status;
|
||||
}
|
||||
|
||||
void dlm_revert_pending_convert(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
/* do not alter lock refcount. switching lists. */
|
||||
list_del_init(&lock->list);
|
||||
list_add_tail(&lock->list, &res->granted);
|
||||
lock->ml.convert_type = LKM_IVMODE;
|
||||
lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
|
||||
}
|
||||
|
||||
/* messages the master site to do lock conversion
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS
|
||||
* held on exit: none
|
||||
* returns: DLM_NORMAL, DLM_RECOVERING, status from remote node
|
||||
*/
|
||||
enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags, int type)
|
||||
{
|
||||
enum dlm_status status;
|
||||
|
||||
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
|
||||
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
if (res->state & DLM_LOCK_RES_RECOVERING) {
|
||||
mlog(0, "bailing out early since res is RECOVERING "
|
||||
"on secondary queue\n");
|
||||
/* __dlm_print_one_lock_resource(res); */
|
||||
status = DLM_RECOVERING;
|
||||
goto bail;
|
||||
}
|
||||
/* will exit this call with spinlock held */
|
||||
__dlm_wait_on_lockres(res);
|
||||
|
||||
if (lock->ml.convert_type != LKM_IVMODE) {
|
||||
__dlm_print_one_lock_resource(res);
|
||||
mlog(ML_ERROR, "converting a remote lock that is already "
|
||||
"converting! (cookie=%"MLFu64", conv=%d)\n",
|
||||
lock->ml.cookie, lock->ml.convert_type);
|
||||
status = DLM_DENIED;
|
||||
goto bail;
|
||||
}
|
||||
res->state |= DLM_LOCK_RES_IN_PROGRESS;
|
||||
/* move lock to local convert queue */
|
||||
/* do not alter lock refcount. switching lists. */
|
||||
list_del_init(&lock->list);
|
||||
list_add_tail(&lock->list, &res->converting);
|
||||
lock->convert_pending = 1;
|
||||
lock->ml.convert_type = type;
|
||||
|
||||
if (flags & LKM_VALBLK) {
|
||||
if (lock->ml.type == LKM_EXMODE) {
|
||||
flags |= LKM_PUT_LVB;
|
||||
lock->lksb->flags |= DLM_LKSB_PUT_LVB;
|
||||
} else {
|
||||
if (lock->ml.convert_type == LKM_NLMODE)
|
||||
flags &= ~LKM_VALBLK;
|
||||
else {
|
||||
flags |= LKM_GET_LVB;
|
||||
lock->lksb->flags |= DLM_LKSB_GET_LVB;
|
||||
}
|
||||
}
|
||||
}
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
/* no locks held here.
|
||||
* need to wait for a reply as to whether it got queued or not. */
|
||||
status = dlm_send_remote_convert_request(dlm, res, lock, flags, type);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
|
||||
lock->convert_pending = 0;
|
||||
/* if it failed, move it back to granted queue */
|
||||
if (status != DLM_NORMAL) {
|
||||
if (status != DLM_NOTQUEUED)
|
||||
dlm_error(status);
|
||||
dlm_revert_pending_convert(res, lock);
|
||||
}
|
||||
bail:
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
/* TODO: should this be a wake_one? */
|
||||
/* wake up any IN_PROGRESS waiters */
|
||||
wake_up(&res->wq);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/* sends DLM_CONVERT_LOCK_MSG to master site
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: none
|
||||
* held on exit: none
|
||||
* returns: DLM_NOLOCKMGR, status from remote node
|
||||
*/
|
||||
static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags, int type)
|
||||
{
|
||||
struct dlm_convert_lock convert;
|
||||
int tmpret;
|
||||
enum dlm_status ret;
|
||||
int status = 0;
|
||||
struct kvec vec[2];
|
||||
size_t veclen = 1;
|
||||
|
||||
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
|
||||
|
||||
memset(&convert, 0, sizeof(struct dlm_convert_lock));
|
||||
convert.node_idx = dlm->node_num;
|
||||
convert.requested_type = type;
|
||||
convert.cookie = lock->ml.cookie;
|
||||
convert.namelen = res->lockname.len;
|
||||
convert.flags = cpu_to_be32(flags);
|
||||
memcpy(convert.name, res->lockname.name, convert.namelen);
|
||||
|
||||
vec[0].iov_len = sizeof(struct dlm_convert_lock);
|
||||
vec[0].iov_base = &convert;
|
||||
|
||||
if (flags & LKM_PUT_LVB) {
|
||||
/* extra data to send if we are updating lvb */
|
||||
vec[1].iov_len = DLM_LVB_LEN;
|
||||
vec[1].iov_base = lock->lksb->lvb;
|
||||
veclen++;
|
||||
}
|
||||
|
||||
tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key,
|
||||
vec, veclen, res->owner, &status);
|
||||
if (tmpret >= 0) {
|
||||
// successfully sent and received
|
||||
ret = status; // this is already a dlm_status
|
||||
if (ret == DLM_RECOVERING) {
|
||||
mlog(0, "node %u returned DLM_RECOVERING from convert "
|
||||
"message!\n", res->owner);
|
||||
} else if (ret == DLM_MIGRATING) {
|
||||
mlog(0, "node %u returned DLM_MIGRATING from convert "
|
||||
"message!\n", res->owner);
|
||||
} else if (ret == DLM_FORWARD) {
|
||||
mlog(0, "node %u returned DLM_FORWARD from convert "
|
||||
"message!\n", res->owner);
|
||||
} else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED)
|
||||
dlm_error(ret);
|
||||
} else {
|
||||
mlog_errno(tmpret);
|
||||
if (dlm_is_host_down(tmpret)) {
|
||||
ret = DLM_RECOVERING;
|
||||
mlog(0, "node %u died so returning DLM_RECOVERING "
|
||||
"from convert message!\n", res->owner);
|
||||
} else {
|
||||
ret = dlm_err_to_dlm_status(tmpret);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* handler for DLM_CONVERT_LOCK_MSG on master site
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: takes and drop res->spinlock
|
||||
* held on exit: none
|
||||
* returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS,
|
||||
* status from __dlmconvert_master
|
||||
*/
|
||||
int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
|
||||
{
|
||||
struct dlm_ctxt *dlm = data;
|
||||
struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf;
|
||||
struct dlm_lock_resource *res = NULL;
|
||||
struct list_head *iter;
|
||||
struct dlm_lock *lock = NULL;
|
||||
struct dlm_lockstatus *lksb;
|
||||
enum dlm_status status = DLM_NORMAL;
|
||||
u32 flags;
|
||||
int call_ast = 0, kick_thread = 0;
|
||||
|
||||
if (!dlm_grab(dlm)) {
|
||||
dlm_error(DLM_REJECTED);
|
||||
return DLM_REJECTED;
|
||||
}
|
||||
|
||||
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
|
||||
"Domain %s not fully joined!\n", dlm->name);
|
||||
|
||||
if (cnv->namelen > DLM_LOCKID_NAME_MAX) {
|
||||
status = DLM_IVBUFLEN;
|
||||
dlm_error(status);
|
||||
goto leave;
|
||||
}
|
||||
|
||||
flags = be32_to_cpu(cnv->flags);
|
||||
|
||||
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
|
||||
(LKM_PUT_LVB|LKM_GET_LVB)) {
|
||||
mlog(ML_ERROR, "both PUT and GET lvb specified\n");
|
||||
status = DLM_BADARGS;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" :
|
||||
(flags & LKM_GET_LVB ? "get lvb" : "none"));
|
||||
|
||||
status = DLM_IVLOCKID;
|
||||
res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen);
|
||||
if (!res) {
|
||||
dlm_error(status);
|
||||
goto leave;
|
||||
}
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
list_for_each(iter, &res->granted) {
|
||||
lock = list_entry(iter, struct dlm_lock, list);
|
||||
if (lock->ml.cookie == cnv->cookie &&
|
||||
lock->ml.node == cnv->node_idx) {
|
||||
dlm_lock_get(lock);
|
||||
break;
|
||||
}
|
||||
lock = NULL;
|
||||
}
|
||||
spin_unlock(&res->spinlock);
|
||||
if (!lock) {
|
||||
status = DLM_IVLOCKID;
|
||||
dlm_error(status);
|
||||
goto leave;
|
||||
}
|
||||
|
||||
/* found the lock */
|
||||
lksb = lock->lksb;
|
||||
|
||||
/* see if caller needed to get/put lvb */
|
||||
if (flags & LKM_PUT_LVB) {
|
||||
BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
|
||||
lksb->flags |= DLM_LKSB_PUT_LVB;
|
||||
memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN);
|
||||
} else if (flags & LKM_GET_LVB) {
|
||||
BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
|
||||
lksb->flags |= DLM_LKSB_GET_LVB;
|
||||
}
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
status = __dlm_lockres_state_to_status(res);
|
||||
if (status == DLM_NORMAL) {
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
res->state |= DLM_LOCK_RES_IN_PROGRESS;
|
||||
status = __dlmconvert_master(dlm, res, lock, flags,
|
||||
cnv->requested_type,
|
||||
&call_ast, &kick_thread);
|
||||
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
|
||||
}
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
if (status != DLM_NORMAL) {
|
||||
if (status != DLM_NOTQUEUED)
|
||||
dlm_error(status);
|
||||
lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
|
||||
}
|
||||
|
||||
leave:
|
||||
if (!lock)
|
||||
mlog(ML_ERROR, "did not find lock to convert on grant queue! "
|
||||
"cookie=%"MLFu64"\n",
|
||||
cnv->cookie);
|
||||
else
|
||||
dlm_lock_put(lock);
|
||||
|
||||
/* either queue the ast or release it */
|
||||
if (call_ast)
|
||||
dlm_queue_ast(dlm, lock);
|
||||
else
|
||||
dlm_lockres_release_ast(dlm, res);
|
||||
|
||||
if (kick_thread)
|
||||
dlm_kick_thread(dlm, res);
|
||||
|
||||
if (res)
|
||||
dlm_lockres_put(res);
|
||||
|
||||
dlm_put(dlm);
|
||||
|
||||
return status;
|
||||
}
|
35
fs/ocfs2/dlm/dlmconvert.h
Normal file
35
fs/ocfs2/dlm/dlmconvert.h
Normal file
@ -0,0 +1,35 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmconvert.h
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef DLMCONVERT_H
|
||||
#define DLMCONVERT_H
|
||||
|
||||
enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags, int type);
|
||||
enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags, int type);
|
||||
|
||||
#endif
|
246
fs/ocfs2/dlm/dlmdebug.c
Normal file
246
fs/ocfs2/dlm/dlmdebug.c
Normal file
@ -0,0 +1,246 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmdebug.c
|
||||
*
|
||||
* debug functionality for the dlm
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
#include "dlmcommon.h"
|
||||
#include "dlmdebug.h"
|
||||
|
||||
#include "dlmdomain.h"
|
||||
#include "dlmdebug.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DLM
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
|
||||
{
|
||||
mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
|
||||
res->lockname.len, res->lockname.name,
|
||||
res->owner, res->state);
|
||||
spin_lock(&res->spinlock);
|
||||
__dlm_print_one_lock_resource(res);
|
||||
spin_unlock(&res->spinlock);
|
||||
}
|
||||
|
||||
void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
|
||||
{
|
||||
struct list_head *iter2;
|
||||
struct dlm_lock *lock;
|
||||
|
||||
assert_spin_locked(&res->spinlock);
|
||||
|
||||
mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
|
||||
res->lockname.len, res->lockname.name,
|
||||
res->owner, res->state);
|
||||
mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n",
|
||||
res->last_used, list_empty(&res->purge) ? "no" : "yes");
|
||||
mlog(ML_NOTICE, " granted queue: \n");
|
||||
list_for_each(iter2, &res->granted) {
|
||||
lock = list_entry(iter2, struct dlm_lock, list);
|
||||
spin_lock(&lock->spinlock);
|
||||
mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
|
||||
"cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
|
||||
lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
|
||||
list_empty(&lock->ast_list) ? 'y' : 'n',
|
||||
lock->ast_pending ? 'y' : 'n',
|
||||
list_empty(&lock->bast_list) ? 'y' : 'n',
|
||||
lock->bast_pending ? 'y' : 'n');
|
||||
spin_unlock(&lock->spinlock);
|
||||
}
|
||||
mlog(ML_NOTICE, " converting queue: \n");
|
||||
list_for_each(iter2, &res->converting) {
|
||||
lock = list_entry(iter2, struct dlm_lock, list);
|
||||
spin_lock(&lock->spinlock);
|
||||
mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
|
||||
"cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
|
||||
lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
|
||||
list_empty(&lock->ast_list) ? 'y' : 'n',
|
||||
lock->ast_pending ? 'y' : 'n',
|
||||
list_empty(&lock->bast_list) ? 'y' : 'n',
|
||||
lock->bast_pending ? 'y' : 'n');
|
||||
spin_unlock(&lock->spinlock);
|
||||
}
|
||||
mlog(ML_NOTICE, " blocked queue: \n");
|
||||
list_for_each(iter2, &res->blocked) {
|
||||
lock = list_entry(iter2, struct dlm_lock, list);
|
||||
spin_lock(&lock->spinlock);
|
||||
mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
|
||||
"cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
|
||||
lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie,
|
||||
list_empty(&lock->ast_list) ? 'y' : 'n',
|
||||
lock->ast_pending ? 'y' : 'n',
|
||||
list_empty(&lock->bast_list) ? 'y' : 'n',
|
||||
lock->bast_pending ? 'y' : 'n');
|
||||
spin_unlock(&lock->spinlock);
|
||||
}
|
||||
}
|
||||
|
||||
void dlm_print_one_lock(struct dlm_lock *lockid)
|
||||
{
|
||||
dlm_print_one_lock_resource(lockid->lockres);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dlm_print_one_lock);
|
||||
|
||||
void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
|
||||
{
|
||||
struct dlm_lock_resource *res;
|
||||
struct list_head *iter;
|
||||
struct list_head *bucket;
|
||||
int i;
|
||||
|
||||
mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
|
||||
dlm->name, dlm->node_num, dlm->key);
|
||||
if (!dlm || !dlm->name) {
|
||||
mlog(ML_ERROR, "dlm=%p\n", dlm);
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock(&dlm->spinlock);
|
||||
for (i=0; i<DLM_HASH_SIZE; i++) {
|
||||
bucket = &(dlm->resources[i]);
|
||||
list_for_each(iter, bucket) {
|
||||
res = list_entry(iter, struct dlm_lock_resource, list);
|
||||
dlm_print_one_lock_resource(res);
|
||||
}
|
||||
}
|
||||
spin_unlock(&dlm->spinlock);
|
||||
}
|
||||
|
||||
static const char *dlm_errnames[] = {
|
||||
[DLM_NORMAL] = "DLM_NORMAL",
|
||||
[DLM_GRANTED] = "DLM_GRANTED",
|
||||
[DLM_DENIED] = "DLM_DENIED",
|
||||
[DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS",
|
||||
[DLM_WORKING] = "DLM_WORKING",
|
||||
[DLM_BLOCKED] = "DLM_BLOCKED",
|
||||
[DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN",
|
||||
[DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD",
|
||||
[DLM_SYSERR] = "DLM_SYSERR",
|
||||
[DLM_NOSUPPORT] = "DLM_NOSUPPORT",
|
||||
[DLM_CANCELGRANT] = "DLM_CANCELGRANT",
|
||||
[DLM_IVLOCKID] = "DLM_IVLOCKID",
|
||||
[DLM_SYNC] = "DLM_SYNC",
|
||||
[DLM_BADTYPE] = "DLM_BADTYPE",
|
||||
[DLM_BADRESOURCE] = "DLM_BADRESOURCE",
|
||||
[DLM_MAXHANDLES] = "DLM_MAXHANDLES",
|
||||
[DLM_NOCLINFO] = "DLM_NOCLINFO",
|
||||
[DLM_NOLOCKMGR] = "DLM_NOLOCKMGR",
|
||||
[DLM_NOPURGED] = "DLM_NOPURGED",
|
||||
[DLM_BADARGS] = "DLM_BADARGS",
|
||||
[DLM_VOID] = "DLM_VOID",
|
||||
[DLM_NOTQUEUED] = "DLM_NOTQUEUED",
|
||||
[DLM_IVBUFLEN] = "DLM_IVBUFLEN",
|
||||
[DLM_CVTUNGRANT] = "DLM_CVTUNGRANT",
|
||||
[DLM_BADPARAM] = "DLM_BADPARAM",
|
||||
[DLM_VALNOTVALID] = "DLM_VALNOTVALID",
|
||||
[DLM_REJECTED] = "DLM_REJECTED",
|
||||
[DLM_ABORT] = "DLM_ABORT",
|
||||
[DLM_CANCEL] = "DLM_CANCEL",
|
||||
[DLM_IVRESHANDLE] = "DLM_IVRESHANDLE",
|
||||
[DLM_DEADLOCK] = "DLM_DEADLOCK",
|
||||
[DLM_DENIED_NOASTS] = "DLM_DENIED_NOASTS",
|
||||
[DLM_FORWARD] = "DLM_FORWARD",
|
||||
[DLM_TIMEOUT] = "DLM_TIMEOUT",
|
||||
[DLM_IVGROUPID] = "DLM_IVGROUPID",
|
||||
[DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT",
|
||||
[DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH",
|
||||
[DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION",
|
||||
[DLM_NO_CONTROL_DEVICE ] = "DLM_NO_CONTROL_DEVICE ",
|
||||
[DLM_RECOVERING] = "DLM_RECOVERING",
|
||||
[DLM_MIGRATING] = "DLM_MIGRATING",
|
||||
[DLM_MAXSTATS] = "DLM_MAXSTATS",
|
||||
};
|
||||
|
||||
static const char *dlm_errmsgs[] = {
|
||||
[DLM_NORMAL] = "request in progress",
|
||||
[DLM_GRANTED] = "request granted",
|
||||
[DLM_DENIED] = "request denied",
|
||||
[DLM_DENIED_NOLOCKS] = "request denied, out of system resources",
|
||||
[DLM_WORKING] = "async request in progress",
|
||||
[DLM_BLOCKED] = "lock request blocked",
|
||||
[DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock",
|
||||
[DLM_DENIED_GRACE_PERIOD] = "topological change in progress",
|
||||
[DLM_SYSERR] = "system error",
|
||||
[DLM_NOSUPPORT] = "unsupported",
|
||||
[DLM_CANCELGRANT] = "can't cancel convert: already granted",
|
||||
[DLM_IVLOCKID] = "bad lockid",
|
||||
[DLM_SYNC] = "synchronous request granted",
|
||||
[DLM_BADTYPE] = "bad resource type",
|
||||
[DLM_BADRESOURCE] = "bad resource handle",
|
||||
[DLM_MAXHANDLES] = "no more resource handles",
|
||||
[DLM_NOCLINFO] = "can't contact cluster manager",
|
||||
[DLM_NOLOCKMGR] = "can't contact lock manager",
|
||||
[DLM_NOPURGED] = "can't contact purge daemon",
|
||||
[DLM_BADARGS] = "bad api args",
|
||||
[DLM_VOID] = "no status",
|
||||
[DLM_NOTQUEUED] = "NOQUEUE was specified and request failed",
|
||||
[DLM_IVBUFLEN] = "invalid resource name length",
|
||||
[DLM_CVTUNGRANT] = "attempted to convert ungranted lock",
|
||||
[DLM_BADPARAM] = "invalid lock mode specified",
|
||||
[DLM_VALNOTVALID] = "value block has been invalidated",
|
||||
[DLM_REJECTED] = "request rejected, unrecognized client",
|
||||
[DLM_ABORT] = "blocked lock request cancelled",
|
||||
[DLM_CANCEL] = "conversion request cancelled",
|
||||
[DLM_IVRESHANDLE] = "invalid resource handle",
|
||||
[DLM_DEADLOCK] = "deadlock recovery refused this request",
|
||||
[DLM_DENIED_NOASTS] = "failed to allocate AST",
|
||||
[DLM_FORWARD] = "request must wait for primary's response",
|
||||
[DLM_TIMEOUT] = "timeout value for lock has expired",
|
||||
[DLM_IVGROUPID] = "invalid group specification",
|
||||
[DLM_VERS_CONFLICT] = "version conflicts prevent request handling",
|
||||
[DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong",
|
||||
[DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device",
|
||||
[DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ",
|
||||
[DLM_RECOVERING] = "lock resource being recovered",
|
||||
[DLM_MIGRATING] = "lock resource being migrated",
|
||||
[DLM_MAXSTATS] = "invalid error number",
|
||||
};
|
||||
|
||||
const char *dlm_errmsg(enum dlm_status err)
|
||||
{
|
||||
if (err >= DLM_MAXSTATS || err < 0)
|
||||
return dlm_errmsgs[DLM_MAXSTATS];
|
||||
return dlm_errmsgs[err];
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dlm_errmsg);
|
||||
|
||||
const char *dlm_errname(enum dlm_status err)
|
||||
{
|
||||
if (err >= DLM_MAXSTATS || err < 0)
|
||||
return dlm_errnames[DLM_MAXSTATS];
|
||||
return dlm_errnames[err];
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dlm_errname);
|
30
fs/ocfs2/dlm/dlmdebug.h
Normal file
30
fs/ocfs2/dlm/dlmdebug.h
Normal file
@ -0,0 +1,30 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmdebug.h
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef DLMDEBUG_H
#define DLMDEBUG_H

/*
 * Forward declaration so this header can be included on its own;
 * without it the struct tag would be scoped to the prototype below.
 */
struct dlm_ctxt;

/* Debug helper: dump the lock resources of the given DLM context. */
void dlm_dump_lock_resources(struct dlm_ctxt *dlm);

#endif
|
1469
fs/ocfs2/dlm/dlmdomain.c
Normal file
1469
fs/ocfs2/dlm/dlmdomain.c
Normal file
File diff suppressed because it is too large
Load Diff
36
fs/ocfs2/dlm/dlmdomain.h
Normal file
36
fs/ocfs2/dlm/dlmdomain.h
Normal file
@ -0,0 +1,36 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmdomain.h
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef DLMDOMAIN_H
|
||||
#define DLMDOMAIN_H
|
||||
|
||||
extern spinlock_t dlm_domain_lock;
|
||||
extern struct list_head dlm_domains;
|
||||
|
||||
int dlm_joined(struct dlm_ctxt *dlm);
|
||||
int dlm_shutting_down(struct dlm_ctxt *dlm);
|
||||
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
|
||||
int node_num);
|
||||
|
||||
#endif
|
640
fs/ocfs2/dlm/dlmfs.c
Normal file
640
fs/ocfs2/dlm/dlmfs.c
Normal file
@ -0,0 +1,640 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmfs.c
|
||||
*
|
||||
* Code which implements the kernel side of a minimal userspace
|
||||
* interface to our DLM. This file handles the virtual file system
|
||||
* used for communication with userspace. Credit should go to ramfs,
|
||||
* which was a template for the fs side of this module.
|
||||
*
|
||||
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
/* Simple VFS hooks based on: */
|
||||
/*
|
||||
* Resizable simple ram filesystem for Linux.
|
||||
*
|
||||
* Copyright (C) 2000 Linus Torvalds.
|
||||
* 2000 Transmeta Corp.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/smp_lock.h>
|
||||
#include <linux/backing-dev.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
|
||||
#include "userdlm.h"
|
||||
|
||||
#include "dlmfsver.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DLMFS
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
static struct super_operations dlmfs_ops;
|
||||
static struct file_operations dlmfs_file_operations;
|
||||
static struct inode_operations dlmfs_dir_inode_operations;
|
||||
static struct inode_operations dlmfs_root_inode_operations;
|
||||
static struct inode_operations dlmfs_file_inode_operations;
|
||||
static kmem_cache_t *dlmfs_inode_cache;
|
||||
|
||||
struct workqueue_struct *user_dlm_worker;
|
||||
|
||||
/*
|
||||
* decodes a set of open flags into a valid lock level and a set of flags.
|
||||
* returns < 0 if we have invalid flags
|
||||
* flags which mean something to us:
|
||||
* O_RDONLY -> PRMODE level
|
||||
* O_WRONLY -> EXMODE level
|
||||
*
|
||||
* O_NONBLOCK -> LKM_NOQUEUE
|
||||
*/
|
||||
static int dlmfs_decode_open_flags(int open_flags,
|
||||
int *level,
|
||||
int *flags)
|
||||
{
|
||||
if (open_flags & (O_WRONLY|O_RDWR))
|
||||
*level = LKM_EXMODE;
|
||||
else
|
||||
*level = LKM_PRMODE;
|
||||
|
||||
*flags = 0;
|
||||
if (open_flags & O_NONBLOCK)
|
||||
*flags |= LKM_NOQUEUE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dlmfs_file_open(struct inode *inode,
|
||||
struct file *file)
|
||||
{
|
||||
int status, level, flags;
|
||||
struct dlmfs_filp_private *fp = NULL;
|
||||
struct dlmfs_inode_private *ip;
|
||||
|
||||
if (S_ISDIR(inode->i_mode))
|
||||
BUG();
|
||||
|
||||
mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino,
|
||||
file->f_flags);
|
||||
|
||||
status = dlmfs_decode_open_flags(file->f_flags, &level, &flags);
|
||||
if (status < 0)
|
||||
goto bail;
|
||||
|
||||
/* We don't want to honor O_APPEND at read/write time as it
|
||||
* doesn't make sense for LVB writes. */
|
||||
file->f_flags &= ~O_APPEND;
|
||||
|
||||
fp = kmalloc(sizeof(*fp), GFP_KERNEL);
|
||||
if (!fp) {
|
||||
status = -ENOMEM;
|
||||
goto bail;
|
||||
}
|
||||
fp->fp_lock_level = level;
|
||||
|
||||
ip = DLMFS_I(inode);
|
||||
|
||||
status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags);
|
||||
if (status < 0) {
|
||||
/* this is a strange error to return here but I want
|
||||
* to be able userspace to be able to distinguish a
|
||||
* valid lock request from one that simply couldn't be
|
||||
* granted. */
|
||||
if (flags & LKM_NOQUEUE && status == -EAGAIN)
|
||||
status = -ETXTBSY;
|
||||
kfree(fp);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
file->private_data = fp;
|
||||
bail:
|
||||
return status;
|
||||
}
|
||||
|
||||
static int dlmfs_file_release(struct inode *inode,
|
||||
struct file *file)
|
||||
{
|
||||
int level, status;
|
||||
struct dlmfs_inode_private *ip = DLMFS_I(inode);
|
||||
struct dlmfs_filp_private *fp =
|
||||
(struct dlmfs_filp_private *) file->private_data;
|
||||
|
||||
if (S_ISDIR(inode->i_mode))
|
||||
BUG();
|
||||
|
||||
mlog(0, "close called on inode %lu\n", inode->i_ino);
|
||||
|
||||
status = 0;
|
||||
if (fp) {
|
||||
level = fp->fp_lock_level;
|
||||
if (level != LKM_IVMODE)
|
||||
user_dlm_cluster_unlock(&ip->ip_lockres, level);
|
||||
|
||||
kfree(fp);
|
||||
file->private_data = NULL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t dlmfs_file_read(struct file *filp,
|
||||
char __user *buf,
|
||||
size_t count,
|
||||
loff_t *ppos)
|
||||
{
|
||||
int bytes_left;
|
||||
ssize_t readlen;
|
||||
char *lvb_buf;
|
||||
struct inode *inode = filp->f_dentry->d_inode;
|
||||
|
||||
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
|
||||
inode->i_ino, count, *ppos);
|
||||
|
||||
if (*ppos >= i_size_read(inode))
|
||||
return 0;
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
if (!access_ok(VERIFY_WRITE, buf, count))
|
||||
return -EFAULT;
|
||||
|
||||
/* don't read past the lvb */
|
||||
if ((count + *ppos) > i_size_read(inode))
|
||||
readlen = i_size_read(inode) - *ppos;
|
||||
else
|
||||
readlen = count - *ppos;
|
||||
|
||||
lvb_buf = kmalloc(readlen, GFP_KERNEL);
|
||||
if (!lvb_buf)
|
||||
return -ENOMEM;
|
||||
|
||||
user_dlm_read_lvb(inode, lvb_buf, readlen);
|
||||
bytes_left = __copy_to_user(buf, lvb_buf, readlen);
|
||||
readlen -= bytes_left;
|
||||
|
||||
kfree(lvb_buf);
|
||||
|
||||
*ppos = *ppos + readlen;
|
||||
|
||||
mlog(0, "read %zd bytes\n", readlen);
|
||||
return readlen;
|
||||
}
|
||||
|
||||
static ssize_t dlmfs_file_write(struct file *filp,
|
||||
const char __user *buf,
|
||||
size_t count,
|
||||
loff_t *ppos)
|
||||
{
|
||||
int bytes_left;
|
||||
ssize_t writelen;
|
||||
char *lvb_buf;
|
||||
struct inode *inode = filp->f_dentry->d_inode;
|
||||
|
||||
mlog(0, "inode %lu, count = %zu, *ppos = %llu\n",
|
||||
inode->i_ino, count, *ppos);
|
||||
|
||||
if (*ppos >= i_size_read(inode))
|
||||
return -ENOSPC;
|
||||
|
||||
if (!count)
|
||||
return 0;
|
||||
|
||||
if (!access_ok(VERIFY_READ, buf, count))
|
||||
return -EFAULT;
|
||||
|
||||
/* don't write past the lvb */
|
||||
if ((count + *ppos) > i_size_read(inode))
|
||||
writelen = i_size_read(inode) - *ppos;
|
||||
else
|
||||
writelen = count - *ppos;
|
||||
|
||||
lvb_buf = kmalloc(writelen, GFP_KERNEL);
|
||||
if (!lvb_buf)
|
||||
return -ENOMEM;
|
||||
|
||||
bytes_left = copy_from_user(lvb_buf, buf, writelen);
|
||||
writelen -= bytes_left;
|
||||
if (writelen)
|
||||
user_dlm_write_lvb(inode, lvb_buf, writelen);
|
||||
|
||||
kfree(lvb_buf);
|
||||
|
||||
*ppos = *ppos + writelen;
|
||||
mlog(0, "wrote %zd bytes\n", writelen);
|
||||
return writelen;
|
||||
}
|
||||
|
||||
static void dlmfs_init_once(void *foo,
|
||||
kmem_cache_t *cachep,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct dlmfs_inode_private *ip =
|
||||
(struct dlmfs_inode_private *) foo;
|
||||
|
||||
if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
|
||||
SLAB_CTOR_CONSTRUCTOR) {
|
||||
ip->ip_dlm = NULL;
|
||||
ip->ip_parent = NULL;
|
||||
|
||||
inode_init_once(&ip->ip_vfs_inode);
|
||||
}
|
||||
}
|
||||
|
||||
static struct inode *dlmfs_alloc_inode(struct super_block *sb)
|
||||
{
|
||||
struct dlmfs_inode_private *ip;
|
||||
|
||||
ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS);
|
||||
if (!ip)
|
||||
return NULL;
|
||||
|
||||
return &ip->ip_vfs_inode;
|
||||
}
|
||||
|
||||
static void dlmfs_destroy_inode(struct inode *inode)
|
||||
{
|
||||
kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode));
|
||||
}
|
||||
|
||||
static void dlmfs_clear_inode(struct inode *inode)
|
||||
{
|
||||
int status;
|
||||
struct dlmfs_inode_private *ip;
|
||||
|
||||
if (!inode)
|
||||
return;
|
||||
|
||||
mlog(0, "inode %lu\n", inode->i_ino);
|
||||
|
||||
ip = DLMFS_I(inode);
|
||||
|
||||
if (S_ISREG(inode->i_mode)) {
|
||||
status = user_dlm_destroy_lock(&ip->ip_lockres);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
iput(ip->ip_parent);
|
||||
goto clear_fields;
|
||||
}
|
||||
|
||||
mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm);
|
||||
/* we must be a directory. If required, lets unregister the
|
||||
* dlm context now. */
|
||||
if (ip->ip_dlm)
|
||||
user_dlm_unregister_context(ip->ip_dlm);
|
||||
clear_fields:
|
||||
ip->ip_parent = NULL;
|
||||
ip->ip_dlm = NULL;
|
||||
}
|
||||
|
||||
static struct backing_dev_info dlmfs_backing_dev_info = {
|
||||
.ra_pages = 0, /* No readahead */
|
||||
.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
|
||||
};
|
||||
|
||||
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
|
||||
{
|
||||
struct inode *inode = new_inode(sb);
|
||||
int mode = S_IFDIR | 0755;
|
||||
struct dlmfs_inode_private *ip;
|
||||
|
||||
if (inode) {
|
||||
ip = DLMFS_I(inode);
|
||||
|
||||
inode->i_mode = mode;
|
||||
inode->i_uid = current->fsuid;
|
||||
inode->i_gid = current->fsgid;
|
||||
inode->i_blksize = PAGE_CACHE_SIZE;
|
||||
inode->i_blocks = 0;
|
||||
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
|
||||
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
|
||||
inode->i_nlink++;
|
||||
|
||||
inode->i_fop = &simple_dir_operations;
|
||||
inode->i_op = &dlmfs_root_inode_operations;
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
static struct inode *dlmfs_get_inode(struct inode *parent,
|
||||
struct dentry *dentry,
|
||||
int mode)
|
||||
{
|
||||
struct super_block *sb = parent->i_sb;
|
||||
struct inode * inode = new_inode(sb);
|
||||
struct dlmfs_inode_private *ip;
|
||||
|
||||
if (!inode)
|
||||
return NULL;
|
||||
|
||||
inode->i_mode = mode;
|
||||
inode->i_uid = current->fsuid;
|
||||
inode->i_gid = current->fsgid;
|
||||
inode->i_blksize = PAGE_CACHE_SIZE;
|
||||
inode->i_blocks = 0;
|
||||
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
|
||||
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
|
||||
|
||||
ip = DLMFS_I(inode);
|
||||
ip->ip_dlm = DLMFS_I(parent)->ip_dlm;
|
||||
|
||||
switch (mode & S_IFMT) {
|
||||
default:
|
||||
/* for now we don't support anything other than
|
||||
* directories and regular files. */
|
||||
BUG();
|
||||
break;
|
||||
case S_IFREG:
|
||||
inode->i_op = &dlmfs_file_inode_operations;
|
||||
inode->i_fop = &dlmfs_file_operations;
|
||||
|
||||
i_size_write(inode, DLM_LVB_LEN);
|
||||
|
||||
user_dlm_lock_res_init(&ip->ip_lockres, dentry);
|
||||
|
||||
/* released at clear_inode time, this insures that we
|
||||
* get to drop the dlm reference on each lock *before*
|
||||
* we call the unregister code for releasing parent
|
||||
* directories. */
|
||||
ip->ip_parent = igrab(parent);
|
||||
BUG_ON(!ip->ip_parent);
|
||||
break;
|
||||
case S_IFDIR:
|
||||
inode->i_op = &dlmfs_dir_inode_operations;
|
||||
inode->i_fop = &simple_dir_operations;
|
||||
|
||||
/* directory inodes start off with i_nlink ==
|
||||
* 2 (for "." entry) */
|
||||
inode->i_nlink++;
|
||||
break;
|
||||
}
|
||||
|
||||
if (parent->i_mode & S_ISGID) {
|
||||
inode->i_gid = parent->i_gid;
|
||||
if (S_ISDIR(mode))
|
||||
inode->i_mode |= S_ISGID;
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
/*
|
||||
* File creation. Allocate an inode, and we're done..
|
||||
*/
|
||||
/* SMP-safe */
|
||||
static int dlmfs_mkdir(struct inode * dir,
|
||||
struct dentry * dentry,
|
||||
int mode)
|
||||
{
|
||||
int status;
|
||||
struct inode *inode = NULL;
|
||||
struct qstr *domain = &dentry->d_name;
|
||||
struct dlmfs_inode_private *ip;
|
||||
struct dlm_ctxt *dlm;
|
||||
|
||||
mlog(0, "mkdir %.*s\n", domain->len, domain->name);
|
||||
|
||||
/* verify that we have a proper domain */
|
||||
if (domain->len >= O2NM_MAX_NAME_LEN) {
|
||||
status = -EINVAL;
|
||||
mlog(ML_ERROR, "invalid domain name for directory.\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR);
|
||||
if (!inode) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
ip = DLMFS_I(inode);
|
||||
|
||||
dlm = user_dlm_register_context(domain);
|
||||
if (IS_ERR(dlm)) {
|
||||
status = PTR_ERR(dlm);
|
||||
mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n",
|
||||
status, domain->len, domain->name);
|
||||
goto bail;
|
||||
}
|
||||
ip->ip_dlm = dlm;
|
||||
|
||||
dir->i_nlink++;
|
||||
d_instantiate(dentry, inode);
|
||||
dget(dentry); /* Extra count - pin the dentry in core */
|
||||
|
||||
status = 0;
|
||||
bail:
|
||||
if (status < 0)
|
||||
iput(inode);
|
||||
return status;
|
||||
}
|
||||
|
||||
static int dlmfs_create(struct inode *dir,
|
||||
struct dentry *dentry,
|
||||
int mode,
|
||||
struct nameidata *nd)
|
||||
{
|
||||
int status = 0;
|
||||
struct inode *inode;
|
||||
struct qstr *name = &dentry->d_name;
|
||||
|
||||
mlog(0, "create %.*s\n", name->len, name->name);
|
||||
|
||||
/* verify name is valid and doesn't contain any dlm reserved
|
||||
* characters */
|
||||
if (name->len >= USER_DLM_LOCK_ID_MAX_LEN ||
|
||||
name->name[0] == '$') {
|
||||
status = -EINVAL;
|
||||
mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len,
|
||||
name->name);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG);
|
||||
if (!inode) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
d_instantiate(dentry, inode);
|
||||
dget(dentry); /* Extra count - pin the dentry in core */
|
||||
bail:
|
||||
return status;
|
||||
}
|
||||
|
||||
static int dlmfs_unlink(struct inode *dir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
int status;
|
||||
struct inode *inode = dentry->d_inode;
|
||||
|
||||
mlog(0, "unlink inode %lu\n", inode->i_ino);
|
||||
|
||||
/* if there are no current holders, or none that are waiting
|
||||
* to acquire a lock, this basically destroys our lockres. */
|
||||
status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres);
|
||||
if (status < 0) {
|
||||
mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n",
|
||||
dentry->d_name.len, dentry->d_name.name, status);
|
||||
goto bail;
|
||||
}
|
||||
status = simple_unlink(dir, dentry);
|
||||
bail:
|
||||
return status;
|
||||
}
|
||||
|
||||
static int dlmfs_fill_super(struct super_block * sb,
|
||||
void * data,
|
||||
int silent)
|
||||
{
|
||||
struct inode * inode;
|
||||
struct dentry * root;
|
||||
|
||||
sb->s_maxbytes = MAX_LFS_FILESIZE;
|
||||
sb->s_blocksize = PAGE_CACHE_SIZE;
|
||||
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
|
||||
sb->s_magic = DLMFS_MAGIC;
|
||||
sb->s_op = &dlmfs_ops;
|
||||
inode = dlmfs_get_root_inode(sb);
|
||||
if (!inode)
|
||||
return -ENOMEM;
|
||||
|
||||
root = d_alloc_root(inode);
|
||||
if (!root) {
|
||||
iput(inode);
|
||||
return -ENOMEM;
|
||||
}
|
||||
sb->s_root = root;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct file_operations dlmfs_file_operations = {
|
||||
.open = dlmfs_file_open,
|
||||
.release = dlmfs_file_release,
|
||||
.read = dlmfs_file_read,
|
||||
.write = dlmfs_file_write,
|
||||
};
|
||||
|
||||
static struct inode_operations dlmfs_dir_inode_operations = {
|
||||
.create = dlmfs_create,
|
||||
.lookup = simple_lookup,
|
||||
.unlink = dlmfs_unlink,
|
||||
};
|
||||
|
||||
/* this way we can restrict mkdir to only the toplevel of the fs. */
|
||||
static struct inode_operations dlmfs_root_inode_operations = {
|
||||
.lookup = simple_lookup,
|
||||
.mkdir = dlmfs_mkdir,
|
||||
.rmdir = simple_rmdir,
|
||||
};
|
||||
|
||||
static struct super_operations dlmfs_ops = {
|
||||
.statfs = simple_statfs,
|
||||
.alloc_inode = dlmfs_alloc_inode,
|
||||
.destroy_inode = dlmfs_destroy_inode,
|
||||
.clear_inode = dlmfs_clear_inode,
|
||||
.drop_inode = generic_delete_inode,
|
||||
};
|
||||
|
||||
static struct inode_operations dlmfs_file_inode_operations = {
|
||||
.getattr = simple_getattr,
|
||||
};
|
||||
|
||||
static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type,
|
||||
int flags, const char *dev_name, void *data)
|
||||
{
|
||||
return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super);
|
||||
}
|
||||
|
||||
static struct file_system_type dlmfs_fs_type = {
|
||||
.owner = THIS_MODULE,
|
||||
.name = "ocfs2_dlmfs",
|
||||
.get_sb = dlmfs_get_sb,
|
||||
.kill_sb = kill_litter_super,
|
||||
};
|
||||
|
||||
/*
 * Module init: print the version banner, create the inode cache and
 * the "user_dlm" workqueue, then register the filesystem.  Whatever
 * was set up is torn down again if a later step fails.
 *
 * Returns 0 on success, -ENOMEM if the cache or workqueue cannot be
 * created, or the error from register_filesystem().
 */
static int __init init_dlmfs_fs(void)
{
	int status;

	dlmfs_print_version();

	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
				sizeof(struct dlmfs_inode_private),
				0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
				dlmfs_init_once, NULL);
	if (!dlmfs_inode_cache)
		return -ENOMEM;

	user_dlm_worker = create_singlethread_workqueue("user_dlm");
	if (!user_dlm_worker) {
		kmem_cache_destroy(dlmfs_inode_cache);
		return -ENOMEM;
	}

	status = register_filesystem(&dlmfs_fs_type);
	if (status) {
		kmem_cache_destroy(dlmfs_inode_cache);
		destroy_workqueue(user_dlm_worker);
		return status;
	}

	printk("OCFS2 User DLM kernel interface loaded\n");
	return status;
}
|
||||
|
||||
/*
 * Module exit: unregister the filesystem, drain and destroy the
 * workqueue, and destroy the inode cache, logging a notice when
 * kmem_cache_destroy() returns non-zero.
 */
static void __exit exit_dlmfs_fs(void)
{
	int cache_busy;

	unregister_filesystem(&dlmfs_fs_type);

	flush_workqueue(user_dlm_worker);
	destroy_workqueue(user_dlm_worker);

	cache_busy = kmem_cache_destroy(dlmfs_inode_cache);
	if (cache_busy)
		printk(KERN_INFO "dlmfs_inode_cache: not all structures "
		       "were freed\n");
}
|
||||
|
||||
MODULE_AUTHOR("Oracle");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(init_dlmfs_fs)
|
||||
module_exit(exit_dlmfs_fs)
|
42
fs/ocfs2/dlm/dlmfsver.c
Normal file
42
fs/ocfs2/dlm/dlmfsver.c
Normal file
@ -0,0 +1,42 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmfsver.c
|
||||
*
|
||||
* version string
|
||||
*
|
||||
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "dlmfsver.h"
|
||||
|
||||
#define DLM_BUILD_VERSION "1.3.3"
|
||||
|
||||
#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
|
||||
|
||||
void dlmfs_print_version(void)
|
||||
{
|
||||
printk(KERN_INFO "%s\n", VERSION_STR);
|
||||
}
|
||||
|
||||
MODULE_DESCRIPTION(VERSION_STR);
|
||||
|
||||
MODULE_VERSION(DLM_BUILD_VERSION);
|
31
fs/ocfs2/dlm/dlmfsver.h
Normal file
31
fs/ocfs2/dlm/dlmfsver.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmfsver.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef DLMFS_VER_H
|
||||
#define DLMFS_VER_H
|
||||
|
||||
/* Logs the DLMFS version banner via printk(); implemented in dlmfsver.c. */
void dlmfs_print_version(void);
|
||||
|
||||
#endif /* DLMFS_VER_H */
|
676
fs/ocfs2/dlm/dlmlock.c
Normal file
676
fs/ocfs2/dlm/dlmlock.c
Normal file
@ -0,0 +1,676 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmlock.c
|
||||
*
|
||||
* underlying calls for lock creation
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/delay.h>
|
||||
|
||||
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
#include "dlmcommon.h"
|
||||
|
||||
#include "dlmconvert.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DLM
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED;
|
||||
static u64 dlm_next_cookie = 1;
|
||||
|
||||
static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags);
|
||||
static void dlm_init_lock(struct dlm_lock *newlock, int type,
|
||||
u8 node, u64 cookie);
|
||||
static void dlm_lock_release(struct kref *kref);
|
||||
static void dlm_lock_detach_lockres(struct dlm_lock *lock);
|
||||
|
||||
/* Tell us whether we can grant a new lock request.
|
||||
* locking:
|
||||
* caller needs: res->spinlock
|
||||
* taken: none
|
||||
* held on exit: none
|
||||
* returns: 1 if the lock can be granted, 0 otherwise.
|
||||
*/
|
||||
static int dlm_can_grant_new_lock(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
struct list_head *iter;
|
||||
struct dlm_lock *tmplock;
|
||||
|
||||
list_for_each(iter, &res->granted) {
|
||||
tmplock = list_entry(iter, struct dlm_lock, list);
|
||||
|
||||
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
|
||||
return 0;
|
||||
}
|
||||
|
||||
list_for_each(iter, &res->converting) {
|
||||
tmplock = list_entry(iter, struct dlm_lock, list);
|
||||
|
||||
if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type))
|
||||
return 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* performs lock creation at the lockres master site
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: takes and drops res->spinlock
|
||||
* held on exit: none
|
||||
* returns: DLM_NORMAL, DLM_NOTQUEUED
|
||||
*/
|
||||
static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags)
|
||||
{
|
||||
int call_ast = 0, kick_thread = 0;
|
||||
enum dlm_status status = DLM_NORMAL;
|
||||
|
||||
mlog_entry("type=%d\n", lock->ml.type);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
/* if called from dlm_create_lock_handler, need to
|
||||
* ensure it will not sleep in dlm_wait_on_lockres */
|
||||
status = __dlm_lockres_state_to_status(res);
|
||||
if (status != DLM_NORMAL &&
|
||||
lock->ml.node != dlm->node_num) {
|
||||
/* erf. state changed after lock was dropped. */
|
||||
spin_unlock(&res->spinlock);
|
||||
dlm_error(status);
|
||||
return status;
|
||||
}
|
||||
__dlm_wait_on_lockres(res);
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
|
||||
if (dlm_can_grant_new_lock(res, lock)) {
|
||||
mlog(0, "I can grant this lock right away\n");
|
||||
/* got it right away */
|
||||
lock->lksb->status = DLM_NORMAL;
|
||||
status = DLM_NORMAL;
|
||||
dlm_lock_get(lock);
|
||||
list_add_tail(&lock->list, &res->granted);
|
||||
|
||||
/* for the recovery lock, we can't allow the ast
|
||||
* to be queued since the dlmthread is already
|
||||
* frozen. but the recovery lock is always locked
|
||||
* with LKM_NOQUEUE so we do not need the ast in
|
||||
* this special case */
|
||||
if (!dlm_is_recovery_lock(res->lockname.name,
|
||||
res->lockname.len)) {
|
||||
kick_thread = 1;
|
||||
call_ast = 1;
|
||||
}
|
||||
} else {
|
||||
/* for NOQUEUE request, unless we get the
|
||||
* lock right away, return DLM_NOTQUEUED */
|
||||
if (flags & LKM_NOQUEUE)
|
||||
status = DLM_NOTQUEUED;
|
||||
else {
|
||||
dlm_lock_get(lock);
|
||||
list_add_tail(&lock->list, &res->blocked);
|
||||
kick_thread = 1;
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock(&res->spinlock);
|
||||
wake_up(&res->wq);
|
||||
|
||||
/* either queue the ast or release it */
|
||||
if (call_ast)
|
||||
dlm_queue_ast(dlm, lock);
|
||||
else
|
||||
dlm_lockres_release_ast(dlm, res);
|
||||
|
||||
dlm_lockres_calc_usage(dlm, res);
|
||||
if (kick_thread)
|
||||
dlm_kick_thread(dlm, res);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void dlm_revert_pending_lock(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
/* remove from local queue if it failed */
|
||||
list_del_init(&lock->list);
|
||||
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: takes and drops res->spinlock
|
||||
* held on exit: none
|
||||
* returns: DLM_DENIED, DLM_RECOVERING, or net status
|
||||
*/
|
||||
static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags)
|
||||
{
|
||||
enum dlm_status status = DLM_DENIED;
|
||||
|
||||
mlog_entry("type=%d\n", lock->ml.type);
|
||||
mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
|
||||
res->lockname.name, flags);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
|
||||
/* will exit this call with spinlock held */
|
||||
__dlm_wait_on_lockres(res);
|
||||
res->state |= DLM_LOCK_RES_IN_PROGRESS;
|
||||
|
||||
/* add lock to local (secondary) queue */
|
||||
dlm_lock_get(lock);
|
||||
list_add_tail(&lock->list, &res->blocked);
|
||||
lock->lock_pending = 1;
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
/* spec seems to say that you will get DLM_NORMAL when the lock
|
||||
* has been queued, meaning we need to wait for a reply here. */
|
||||
status = dlm_send_remote_lock_request(dlm, res, lock, flags);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
|
||||
lock->lock_pending = 0;
|
||||
if (status != DLM_NORMAL) {
|
||||
if (status != DLM_NOTQUEUED)
|
||||
dlm_error(status);
|
||||
dlm_revert_pending_lock(res, lock);
|
||||
dlm_lock_put(lock);
|
||||
}
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
dlm_lockres_calc_usage(dlm, res);
|
||||
|
||||
wake_up(&res->wq);
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
/* for remote lock creation.
|
||||
* locking:
|
||||
* caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS
|
||||
* taken: none
|
||||
* held on exit: none
|
||||
* returns: DLM_NOLOCKMGR, or net status
|
||||
*/
|
||||
static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock, int flags)
|
||||
{
|
||||
struct dlm_create_lock create;
|
||||
int tmpret, status = 0;
|
||||
enum dlm_status ret;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
memset(&create, 0, sizeof(create));
|
||||
create.node_idx = dlm->node_num;
|
||||
create.requested_type = lock->ml.type;
|
||||
create.cookie = lock->ml.cookie;
|
||||
create.namelen = res->lockname.len;
|
||||
create.flags = cpu_to_be32(flags);
|
||||
memcpy(create.name, res->lockname.name, create.namelen);
|
||||
|
||||
tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
|
||||
sizeof(create), res->owner, &status);
|
||||
if (tmpret >= 0) {
|
||||
// successfully sent and received
|
||||
ret = status; // this is already a dlm_status
|
||||
} else {
|
||||
mlog_errno(tmpret);
|
||||
if (dlm_is_host_down(tmpret)) {
|
||||
ret = DLM_RECOVERING;
|
||||
mlog(0, "node %u died so returning DLM_RECOVERING "
|
||||
"from lock message!\n", res->owner);
|
||||
} else {
|
||||
ret = dlm_err_to_dlm_status(tmpret);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void dlm_lock_get(struct dlm_lock *lock)
|
||||
{
|
||||
kref_get(&lock->lock_refs);
|
||||
}
|
||||
|
||||
void dlm_lock_put(struct dlm_lock *lock)
|
||||
{
|
||||
kref_put(&lock->lock_refs, dlm_lock_release);
|
||||
}
|
||||
|
||||
static void dlm_lock_release(struct kref *kref)
|
||||
{
|
||||
struct dlm_lock *lock;
|
||||
|
||||
lock = container_of(kref, struct dlm_lock, lock_refs);
|
||||
|
||||
BUG_ON(!list_empty(&lock->list));
|
||||
BUG_ON(!list_empty(&lock->ast_list));
|
||||
BUG_ON(!list_empty(&lock->bast_list));
|
||||
BUG_ON(lock->ast_pending);
|
||||
BUG_ON(lock->bast_pending);
|
||||
|
||||
dlm_lock_detach_lockres(lock);
|
||||
|
||||
if (lock->lksb_kernel_allocated) {
|
||||
mlog(0, "freeing kernel-allocated lksb\n");
|
||||
kfree(lock->lksb);
|
||||
}
|
||||
kfree(lock);
|
||||
}
|
||||
|
||||
/* associate a lock with it's lockres, getting a ref on the lockres */
|
||||
void dlm_lock_attach_lockres(struct dlm_lock *lock,
|
||||
struct dlm_lock_resource *res)
|
||||
{
|
||||
dlm_lockres_get(res);
|
||||
lock->lockres = res;
|
||||
}
|
||||
|
||||
/* drop ref on lockres, if there is still one associated with lock */
|
||||
static void dlm_lock_detach_lockres(struct dlm_lock *lock)
|
||||
{
|
||||
struct dlm_lock_resource *res;
|
||||
|
||||
res = lock->lockres;
|
||||
if (res) {
|
||||
lock->lockres = NULL;
|
||||
mlog(0, "removing lock's lockres reference\n");
|
||||
dlm_lockres_put(res);
|
||||
}
|
||||
}
|
||||
|
||||
static void dlm_init_lock(struct dlm_lock *newlock, int type,
|
||||
u8 node, u64 cookie)
|
||||
{
|
||||
INIT_LIST_HEAD(&newlock->list);
|
||||
INIT_LIST_HEAD(&newlock->ast_list);
|
||||
INIT_LIST_HEAD(&newlock->bast_list);
|
||||
spin_lock_init(&newlock->spinlock);
|
||||
newlock->ml.type = type;
|
||||
newlock->ml.convert_type = LKM_IVMODE;
|
||||
newlock->ml.highest_blocked = LKM_IVMODE;
|
||||
newlock->ml.node = node;
|
||||
newlock->ml.pad1 = 0;
|
||||
newlock->ml.list = 0;
|
||||
newlock->ml.flags = 0;
|
||||
newlock->ast = NULL;
|
||||
newlock->bast = NULL;
|
||||
newlock->astdata = NULL;
|
||||
newlock->ml.cookie = cpu_to_be64(cookie);
|
||||
newlock->ast_pending = 0;
|
||||
newlock->bast_pending = 0;
|
||||
newlock->convert_pending = 0;
|
||||
newlock->lock_pending = 0;
|
||||
newlock->unlock_pending = 0;
|
||||
newlock->cancel_pending = 0;
|
||||
newlock->lksb_kernel_allocated = 0;
|
||||
|
||||
kref_init(&newlock->lock_refs);
|
||||
}
|
||||
|
||||
/* Allocate and initialise a new dlm_lock.
 *
 * @lksb may be NULL, in which case a zeroed lksb is allocated here and
 * the lock is flagged lksb_kernel_allocated so that it gets freed
 * together with the lock.  A caller-supplied lksb is used as-is (it is
 * not zeroed).  Lock and lksb are pointed at each other before return.
 *
 * returns: the new lock, or NULL if either allocation failed. */
struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
			       struct dlm_lockstatus *lksb)
{
	struct dlm_lock *lock;
	int own_lksb = (lksb == NULL);

	lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
	if (!lock)
		return NULL;

	if (own_lksb) {
		/* zero memory only if kernel-allocated */
		lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
		if (!lksb) {
			kfree(lock);
			return NULL;
		}
	}

	dlm_init_lock(lock, type, node, cookie);
	if (own_lksb)
		lock->lksb_kernel_allocated = 1;

	lock->lksb = lksb;
	lksb->lockid = lock;
	return lock;
}
|
||||
|
||||
/* handler for lock creation net message
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: takes and drops res->spinlock
|
||||
* held on exit: none
|
||||
* returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED
|
||||
*/
|
||||
int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
|
||||
{
|
||||
struct dlm_ctxt *dlm = data;
|
||||
struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf;
|
||||
struct dlm_lock_resource *res = NULL;
|
||||
struct dlm_lock *newlock = NULL;
|
||||
struct dlm_lockstatus *lksb = NULL;
|
||||
enum dlm_status status = DLM_NORMAL;
|
||||
char *name;
|
||||
unsigned int namelen;
|
||||
|
||||
BUG_ON(!dlm);
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
if (!dlm_grab(dlm))
|
||||
return DLM_REJECTED;
|
||||
|
||||
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
|
||||
"Domain %s not fully joined!\n", dlm->name);
|
||||
|
||||
name = create->name;
|
||||
namelen = create->namelen;
|
||||
|
||||
status = DLM_IVBUFLEN;
|
||||
if (namelen > DLM_LOCKID_NAME_MAX) {
|
||||
dlm_error(status);
|
||||
goto leave;
|
||||
}
|
||||
|
||||
status = DLM_SYSERR;
|
||||
newlock = dlm_new_lock(create->requested_type,
|
||||
create->node_idx,
|
||||
be64_to_cpu(create->cookie), NULL);
|
||||
if (!newlock) {
|
||||
dlm_error(status);
|
||||
goto leave;
|
||||
}
|
||||
|
||||
lksb = newlock->lksb;
|
||||
|
||||
if (be32_to_cpu(create->flags) & LKM_GET_LVB) {
|
||||
lksb->flags |= DLM_LKSB_GET_LVB;
|
||||
mlog(0, "set DLM_LKSB_GET_LVB flag\n");
|
||||
}
|
||||
|
||||
status = DLM_IVLOCKID;
|
||||
res = dlm_lookup_lockres(dlm, name, namelen);
|
||||
if (!res) {
|
||||
dlm_error(status);
|
||||
goto leave;
|
||||
}
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
status = __dlm_lockres_state_to_status(res);
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
if (status != DLM_NORMAL) {
|
||||
mlog(0, "lockres recovering/migrating/in-progress\n");
|
||||
goto leave;
|
||||
}
|
||||
|
||||
dlm_lock_attach_lockres(newlock, res);
|
||||
|
||||
status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags));
|
||||
leave:
|
||||
if (status != DLM_NORMAL)
|
||||
if (newlock)
|
||||
dlm_lock_put(newlock);
|
||||
|
||||
if (res)
|
||||
dlm_lockres_put(res);
|
||||
|
||||
dlm_put(dlm);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */
|
||||
static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie)
|
||||
{
|
||||
u64 tmpnode = node_num;
|
||||
|
||||
/* shift single byte of node num into top 8 bits */
|
||||
tmpnode <<= 56;
|
||||
|
||||
spin_lock(&dlm_cookie_lock);
|
||||
*cookie = (dlm_next_cookie | tmpnode);
|
||||
if (++dlm_next_cookie & 0xff00000000000000ull) {
|
||||
mlog(0, "This node's cookie will now wrap!\n");
|
||||
dlm_next_cookie = 1;
|
||||
}
|
||||
spin_unlock(&dlm_cookie_lock);
|
||||
}
|
||||
|
||||
enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode,
|
||||
struct dlm_lockstatus *lksb, int flags,
|
||||
const char *name, dlm_astlockfunc_t *ast, void *data,
|
||||
dlm_bastlockfunc_t *bast)
|
||||
{
|
||||
enum dlm_status status;
|
||||
struct dlm_lock_resource *res = NULL;
|
||||
struct dlm_lock *lock = NULL;
|
||||
int convert = 0, recovery = 0;
|
||||
|
||||
/* yes this function is a mess.
|
||||
* TODO: clean this up. lots of common code in the
|
||||
* lock and convert paths, especially in the retry blocks */
|
||||
if (!lksb) {
|
||||
dlm_error(DLM_BADARGS);
|
||||
return DLM_BADARGS;
|
||||
}
|
||||
|
||||
status = DLM_BADPARAM;
|
||||
if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) {
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (flags & ~LKM_VALID_FLAGS) {
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
|
||||
convert = (flags & LKM_CONVERT);
|
||||
recovery = (flags & LKM_RECOVERY);
|
||||
|
||||
if (recovery &&
|
||||
(!dlm_is_recovery_lock(name, strlen(name)) || convert) ) {
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
if (convert && (flags & LKM_LOCAL)) {
|
||||
mlog(ML_ERROR, "strange LOCAL convert request!\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (convert) {
|
||||
/* CONVERT request */
|
||||
|
||||
/* if converting, must pass in a valid dlm_lock */
|
||||
lock = lksb->lockid;
|
||||
if (!lock) {
|
||||
mlog(ML_ERROR, "NULL lock pointer in convert "
|
||||
"request\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
res = lock->lockres;
|
||||
if (!res) {
|
||||
mlog(ML_ERROR, "NULL lockres pointer in convert "
|
||||
"request\n");
|
||||
goto error;
|
||||
}
|
||||
dlm_lockres_get(res);
|
||||
|
||||
/* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are
|
||||
* static after the original lock call. convert requests will
|
||||
* ensure that everything is the same, or return DLM_BADARGS.
|
||||
* this means that DLM_DENIED_NOASTS will never be returned.
|
||||
*/
|
||||
if (lock->lksb != lksb || lock->ast != ast ||
|
||||
lock->bast != bast || lock->astdata != data) {
|
||||
status = DLM_BADARGS;
|
||||
mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, "
|
||||
"astdata=%p\n", lksb, ast, bast, data);
|
||||
mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, "
|
||||
"astdata=%p\n", lock->lksb, lock->ast,
|
||||
lock->bast, lock->astdata);
|
||||
goto error;
|
||||
}
|
||||
retry_convert:
|
||||
dlm_wait_for_recovery(dlm);
|
||||
|
||||
if (res->owner == dlm->node_num)
|
||||
status = dlmconvert_master(dlm, res, lock, flags, mode);
|
||||
else
|
||||
status = dlmconvert_remote(dlm, res, lock, flags, mode);
|
||||
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
|
||||
status == DLM_FORWARD) {
|
||||
/* for now, see how this works without sleeping
|
||||
* and just retry right away. I suspect the reco
|
||||
* or migration will complete fast enough that
|
||||
* no waiting will be necessary */
|
||||
mlog(0, "retrying convert with migration/recovery/"
|
||||
"in-progress\n");
|
||||
msleep(100);
|
||||
goto retry_convert;
|
||||
}
|
||||
} else {
|
||||
u64 tmpcookie;
|
||||
|
||||
/* LOCK request */
|
||||
status = DLM_BADARGS;
|
||||
if (!name) {
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
|
||||
status = DLM_IVBUFLEN;
|
||||
if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) {
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
|
||||
dlm_get_next_cookie(dlm->node_num, &tmpcookie);
|
||||
lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb);
|
||||
if (!lock) {
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
|
||||
if (!recovery)
|
||||
dlm_wait_for_recovery(dlm);
|
||||
|
||||
/* find or create the lock resource */
|
||||
res = dlm_get_lock_resource(dlm, name, flags);
|
||||
if (!res) {
|
||||
status = DLM_IVLOCKID;
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
|
||||
mlog(0, "type=%d, flags = 0x%x\n", mode, flags);
|
||||
mlog(0, "creating lock: lock=%p res=%p\n", lock, res);
|
||||
|
||||
dlm_lock_attach_lockres(lock, res);
|
||||
lock->ast = ast;
|
||||
lock->bast = bast;
|
||||
lock->astdata = data;
|
||||
|
||||
retry_lock:
|
||||
if (flags & LKM_VALBLK) {
|
||||
mlog(0, "LKM_VALBLK passed by caller\n");
|
||||
|
||||
/* LVB requests for non PR, PW or EX locks are
|
||||
* ignored. */
|
||||
if (mode < LKM_PRMODE)
|
||||
flags &= ~LKM_VALBLK;
|
||||
else {
|
||||
flags |= LKM_GET_LVB;
|
||||
lock->lksb->flags |= DLM_LKSB_GET_LVB;
|
||||
}
|
||||
}
|
||||
|
||||
if (res->owner == dlm->node_num)
|
||||
status = dlmlock_master(dlm, res, lock, flags);
|
||||
else
|
||||
status = dlmlock_remote(dlm, res, lock, flags);
|
||||
|
||||
if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
|
||||
status == DLM_FORWARD) {
|
||||
mlog(0, "retrying lock with migration/"
|
||||
"recovery/in progress\n");
|
||||
msleep(100);
|
||||
dlm_wait_for_recovery(dlm);
|
||||
goto retry_lock;
|
||||
}
|
||||
|
||||
if (status != DLM_NORMAL) {
|
||||
lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
|
||||
if (status != DLM_NOTQUEUED)
|
||||
dlm_error(status);
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
error:
|
||||
if (status != DLM_NORMAL) {
|
||||
if (lock && !convert)
|
||||
dlm_lock_put(lock);
|
||||
// this is kind of unnecessary
|
||||
lksb->status = status;
|
||||
}
|
||||
|
||||
/* put lockres ref from the convert path
|
||||
* or from dlm_get_lock_resource */
|
||||
if (res)
|
||||
dlm_lockres_put(res);
|
||||
|
||||
return status;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dlmlock);
|
2664
fs/ocfs2/dlm/dlmmaster.c
Normal file
2664
fs/ocfs2/dlm/dlmmaster.c
Normal file
File diff suppressed because it is too large
Load Diff
2132
fs/ocfs2/dlm/dlmrecovery.c
Normal file
2132
fs/ocfs2/dlm/dlmrecovery.c
Normal file
File diff suppressed because it is too large
Load Diff
692
fs/ocfs2/dlm/dlmthread.c
Normal file
692
fs/ocfs2/dlm/dlmthread.c
Normal file
@ -0,0 +1,692 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmthread.c
|
||||
*
|
||||
* standalone DLM module
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/kthread.h>
|
||||
|
||||
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
#include "dlmcommon.h"
|
||||
#include "dlmdomain.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
static int dlm_thread(void *data);
|
||||
|
||||
static void dlm_flush_asts(struct dlm_ctxt *dlm);
|
||||
|
||||
#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num)
|
||||
|
||||
/* will exit holding res->spinlock, but may drop in function */
|
||||
/* waits until flags are cleared on res->state */
|
||||
void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
|
||||
{
|
||||
DECLARE_WAITQUEUE(wait, current);
|
||||
|
||||
assert_spin_locked(&res->spinlock);
|
||||
|
||||
add_wait_queue(&res->wq, &wait);
|
||||
repeat:
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (res->state & flags) {
|
||||
spin_unlock(&res->spinlock);
|
||||
schedule();
|
||||
spin_lock(&res->spinlock);
|
||||
goto repeat;
|
||||
}
|
||||
remove_wait_queue(&res->wq, &wait);
|
||||
current->state = TASK_RUNNING;
|
||||
}
|
||||
|
||||
|
||||
static int __dlm_lockres_unused(struct dlm_lock_resource *res)
|
||||
{
|
||||
if (list_empty(&res->granted) &&
|
||||
list_empty(&res->converting) &&
|
||||
list_empty(&res->blocked) &&
|
||||
list_empty(&res->dirty))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Call whenever you may have added or deleted something from one of
|
||||
* the lockres queue's. This will figure out whether it belongs on the
|
||||
* unused list or not and does the appropriate thing. */
|
||||
void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res)
|
||||
{
|
||||
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
|
||||
|
||||
assert_spin_locked(&dlm->spinlock);
|
||||
assert_spin_locked(&res->spinlock);
|
||||
|
||||
if (__dlm_lockres_unused(res)){
|
||||
if (list_empty(&res->purge)) {
|
||||
mlog(0, "putting lockres %.*s from purge list\n",
|
||||
res->lockname.len, res->lockname.name);
|
||||
|
||||
res->last_used = jiffies;
|
||||
list_add_tail(&res->purge, &dlm->purge_list);
|
||||
dlm->purge_count++;
|
||||
}
|
||||
} else if (!list_empty(&res->purge)) {
|
||||
mlog(0, "removing lockres %.*s from purge list\n",
|
||||
res->lockname.len, res->lockname.name);
|
||||
|
||||
list_del_init(&res->purge);
|
||||
dlm->purge_count--;
|
||||
}
|
||||
}
|
||||
|
||||
void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res)
|
||||
{
|
||||
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
|
||||
spin_lock(&dlm->spinlock);
|
||||
spin_lock(&res->spinlock);
|
||||
|
||||
__dlm_lockres_calc_usage(dlm, res);
|
||||
|
||||
spin_unlock(&res->spinlock);
|
||||
spin_unlock(&dlm->spinlock);
|
||||
}
|
||||
|
||||
/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
|
||||
* to do migration, but will re-acquire before exit. */
|
||||
void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
|
||||
{
|
||||
int master;
|
||||
int ret;
|
||||
|
||||
spin_lock(&lockres->spinlock);
|
||||
master = lockres->owner == dlm->node_num;
|
||||
spin_unlock(&lockres->spinlock);
|
||||
|
||||
mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
|
||||
lockres->lockname.name, master);
|
||||
|
||||
/* Non master is the easy case -- no migration required, just
|
||||
* quit. */
|
||||
if (!master)
|
||||
goto finish;
|
||||
|
||||
/* Wheee! Migrate lockres here! */
|
||||
spin_unlock(&dlm->spinlock);
|
||||
again:
|
||||
|
||||
ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
|
||||
if (ret == -ENOTEMPTY) {
|
||||
mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
|
||||
lockres->lockname.len, lockres->lockname.name);
|
||||
|
||||
BUG();
|
||||
} else if (ret < 0) {
|
||||
mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
|
||||
lockres->lockname.len, lockres->lockname.name);
|
||||
goto again;
|
||||
}
|
||||
|
||||
spin_lock(&dlm->spinlock);
|
||||
|
||||
finish:
|
||||
if (!list_empty(&lockres->purge)) {
|
||||
list_del_init(&lockres->purge);
|
||||
dlm->purge_count--;
|
||||
}
|
||||
__dlm_unhash_lockres(lockres);
|
||||
}
|
||||
|
||||
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
|
||||
int purge_now)
|
||||
{
|
||||
unsigned int run_max, unused;
|
||||
unsigned long purge_jiffies;
|
||||
struct dlm_lock_resource *lockres;
|
||||
|
||||
spin_lock(&dlm->spinlock);
|
||||
run_max = dlm->purge_count;
|
||||
|
||||
while(run_max && !list_empty(&dlm->purge_list)) {
|
||||
run_max--;
|
||||
|
||||
lockres = list_entry(dlm->purge_list.next,
|
||||
struct dlm_lock_resource, purge);
|
||||
|
||||
/* Status of the lockres *might* change so double
|
||||
* check. If the lockres is unused, holding the dlm
|
||||
* spinlock will prevent people from getting and more
|
||||
* refs on it -- there's no need to keep the lockres
|
||||
* spinlock. */
|
||||
spin_lock(&lockres->spinlock);
|
||||
unused = __dlm_lockres_unused(lockres);
|
||||
spin_unlock(&lockres->spinlock);
|
||||
|
||||
if (!unused)
|
||||
continue;
|
||||
|
||||
purge_jiffies = lockres->last_used +
|
||||
msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
|
||||
|
||||
/* Make sure that we want to be processing this guy at
|
||||
* this time. */
|
||||
if (!purge_now && time_after(purge_jiffies, jiffies)) {
|
||||
/* Since resources are added to the purge list
|
||||
* in tail order, we can stop at the first
|
||||
* unpurgable resource -- anyone added after
|
||||
* him will have a greater last_used value */
|
||||
break;
|
||||
}
|
||||
|
||||
list_del_init(&lockres->purge);
|
||||
dlm->purge_count--;
|
||||
|
||||
/* This may drop and reacquire the dlm spinlock if it
|
||||
* has to do migration. */
|
||||
mlog(0, "calling dlm_purge_lockres!\n");
|
||||
dlm_purge_lockres(dlm, lockres);
|
||||
mlog(0, "DONE calling dlm_purge_lockres!\n");
|
||||
|
||||
/* Avoid adding any scheduling latencies */
|
||||
cond_resched_lock(&dlm->spinlock);
|
||||
}
|
||||
|
||||
spin_unlock(&dlm->spinlock);
|
||||
}
|
||||
|
||||
static void dlm_shuffle_lists(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res)
|
||||
{
|
||||
struct dlm_lock *lock, *target;
|
||||
struct list_head *iter;
|
||||
struct list_head *head;
|
||||
int can_grant = 1;
|
||||
|
||||
//mlog(0, "res->lockname.len=%d\n", res->lockname.len);
|
||||
//mlog(0, "res->lockname.name=%p\n", res->lockname.name);
|
||||
//mlog(0, "shuffle res %.*s\n", res->lockname.len,
|
||||
// res->lockname.name);
|
||||
|
||||
/* because this function is called with the lockres
|
||||
* spinlock, and because we know that it is not migrating/
|
||||
* recovering/in-progress, it is fine to reserve asts and
|
||||
* basts right before queueing them all throughout */
|
||||
assert_spin_locked(&res->spinlock);
|
||||
BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING|
|
||||
DLM_LOCK_RES_RECOVERING|
|
||||
DLM_LOCK_RES_IN_PROGRESS)));
|
||||
|
||||
converting:
|
||||
if (list_empty(&res->converting))
|
||||
goto blocked;
|
||||
mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len,
|
||||
res->lockname.name);
|
||||
|
||||
target = list_entry(res->converting.next, struct dlm_lock, list);
|
||||
if (target->ml.convert_type == LKM_IVMODE) {
|
||||
mlog(ML_ERROR, "%.*s: converting a lock with no "
|
||||
"convert_type!\n", res->lockname.len, res->lockname.name);
|
||||
BUG();
|
||||
}
|
||||
head = &res->granted;
|
||||
list_for_each(iter, head) {
|
||||
lock = list_entry(iter, struct dlm_lock, list);
|
||||
if (lock==target)
|
||||
continue;
|
||||
if (!dlm_lock_compatible(lock->ml.type,
|
||||
target->ml.convert_type)) {
|
||||
can_grant = 0;
|
||||
/* queue the BAST if not already */
|
||||
if (lock->ml.highest_blocked == LKM_IVMODE) {
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
dlm_queue_bast(dlm, lock);
|
||||
}
|
||||
/* update the highest_blocked if needed */
|
||||
if (lock->ml.highest_blocked < target->ml.convert_type)
|
||||
lock->ml.highest_blocked =
|
||||
target->ml.convert_type;
|
||||
}
|
||||
}
|
||||
head = &res->converting;
|
||||
list_for_each(iter, head) {
|
||||
lock = list_entry(iter, struct dlm_lock, list);
|
||||
if (lock==target)
|
||||
continue;
|
||||
if (!dlm_lock_compatible(lock->ml.type,
|
||||
target->ml.convert_type)) {
|
||||
can_grant = 0;
|
||||
if (lock->ml.highest_blocked == LKM_IVMODE) {
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
dlm_queue_bast(dlm, lock);
|
||||
}
|
||||
if (lock->ml.highest_blocked < target->ml.convert_type)
|
||||
lock->ml.highest_blocked =
|
||||
target->ml.convert_type;
|
||||
}
|
||||
}
|
||||
|
||||
/* we can convert the lock */
|
||||
if (can_grant) {
|
||||
spin_lock(&target->spinlock);
|
||||
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
|
||||
|
||||
mlog(0, "calling ast for converting lock: %.*s, have: %d, "
|
||||
"granting: %d, node: %u\n", res->lockname.len,
|
||||
res->lockname.name, target->ml.type,
|
||||
target->ml.convert_type, target->ml.node);
|
||||
|
||||
target->ml.type = target->ml.convert_type;
|
||||
target->ml.convert_type = LKM_IVMODE;
|
||||
list_del_init(&target->list);
|
||||
list_add_tail(&target->list, &res->granted);
|
||||
|
||||
BUG_ON(!target->lksb);
|
||||
target->lksb->status = DLM_NORMAL;
|
||||
|
||||
spin_unlock(&target->spinlock);
|
||||
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
dlm_queue_ast(dlm, target);
|
||||
/* go back and check for more */
|
||||
goto converting;
|
||||
}
|
||||
|
||||
blocked:
|
||||
if (list_empty(&res->blocked))
|
||||
goto leave;
|
||||
target = list_entry(res->blocked.next, struct dlm_lock, list);
|
||||
|
||||
head = &res->granted;
|
||||
list_for_each(iter, head) {
|
||||
lock = list_entry(iter, struct dlm_lock, list);
|
||||
if (lock==target)
|
||||
continue;
|
||||
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
|
||||
can_grant = 0;
|
||||
if (lock->ml.highest_blocked == LKM_IVMODE) {
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
dlm_queue_bast(dlm, lock);
|
||||
}
|
||||
if (lock->ml.highest_blocked < target->ml.type)
|
||||
lock->ml.highest_blocked = target->ml.type;
|
||||
}
|
||||
}
|
||||
|
||||
head = &res->converting;
|
||||
list_for_each(iter, head) {
|
||||
lock = list_entry(iter, struct dlm_lock, list);
|
||||
if (lock==target)
|
||||
continue;
|
||||
if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) {
|
||||
can_grant = 0;
|
||||
if (lock->ml.highest_blocked == LKM_IVMODE) {
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
dlm_queue_bast(dlm, lock);
|
||||
}
|
||||
if (lock->ml.highest_blocked < target->ml.type)
|
||||
lock->ml.highest_blocked = target->ml.type;
|
||||
}
|
||||
}
|
||||
|
||||
/* we can grant the blocked lock (only
|
||||
* possible if converting list empty) */
|
||||
if (can_grant) {
|
||||
spin_lock(&target->spinlock);
|
||||
BUG_ON(target->ml.highest_blocked != LKM_IVMODE);
|
||||
|
||||
mlog(0, "calling ast for blocked lock: %.*s, granting: %d, "
|
||||
"node: %u\n", res->lockname.len, res->lockname.name,
|
||||
target->ml.type, target->ml.node);
|
||||
|
||||
// target->ml.type is already correct
|
||||
list_del_init(&target->list);
|
||||
list_add_tail(&target->list, &res->granted);
|
||||
|
||||
BUG_ON(!target->lksb);
|
||||
target->lksb->status = DLM_NORMAL;
|
||||
|
||||
spin_unlock(&target->spinlock);
|
||||
|
||||
__dlm_lockres_reserve_ast(res);
|
||||
dlm_queue_ast(dlm, target);
|
||||
/* go back and check for more */
|
||||
goto converting;
|
||||
}
|
||||
|
||||
leave:
|
||||
return;
|
||||
}
|
||||
|
||||
/* must have NO locks when calling this with res !=NULL * */
|
||||
void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
|
||||
{
|
||||
mlog_entry("dlm=%p, res=%p\n", dlm, res);
|
||||
if (res) {
|
||||
spin_lock(&dlm->spinlock);
|
||||
spin_lock(&res->spinlock);
|
||||
__dlm_dirty_lockres(dlm, res);
|
||||
spin_unlock(&res->spinlock);
|
||||
spin_unlock(&dlm->spinlock);
|
||||
}
|
||||
wake_up(&dlm->dlm_thread_wq);
|
||||
}
|
||||
|
||||
void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
|
||||
{
|
||||
mlog_entry("dlm=%p, res=%p\n", dlm, res);
|
||||
|
||||
assert_spin_locked(&dlm->spinlock);
|
||||
assert_spin_locked(&res->spinlock);
|
||||
|
||||
/* don't shuffle secondary queues */
|
||||
if ((res->owner == dlm->node_num) &&
|
||||
!(res->state & DLM_LOCK_RES_DIRTY)) {
|
||||
list_add_tail(&res->dirty, &dlm->dirty_list);
|
||||
res->state |= DLM_LOCK_RES_DIRTY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Launch the NM thread for the mounted volume */
|
||||
int dlm_launch_thread(struct dlm_ctxt *dlm)
|
||||
{
|
||||
mlog(0, "starting dlm thread...\n");
|
||||
|
||||
dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
|
||||
if (IS_ERR(dlm->dlm_thread_task)) {
|
||||
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
|
||||
dlm->dlm_thread_task = NULL;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void dlm_complete_thread(struct dlm_ctxt *dlm)
|
||||
{
|
||||
if (dlm->dlm_thread_task) {
|
||||
mlog(ML_KTHREAD, "waiting for dlm thread to exit\n");
|
||||
kthread_stop(dlm->dlm_thread_task);
|
||||
dlm->dlm_thread_task = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
|
||||
{
|
||||
int empty;
|
||||
|
||||
spin_lock(&dlm->spinlock);
|
||||
empty = list_empty(&dlm->dirty_list);
|
||||
spin_unlock(&dlm->spinlock);
|
||||
|
||||
return empty;
|
||||
}
|
||||
|
||||
static void dlm_flush_asts(struct dlm_ctxt *dlm)
|
||||
{
|
||||
int ret;
|
||||
struct dlm_lock *lock;
|
||||
struct dlm_lock_resource *res;
|
||||
u8 hi;
|
||||
|
||||
spin_lock(&dlm->ast_lock);
|
||||
while (!list_empty(&dlm->pending_asts)) {
|
||||
lock = list_entry(dlm->pending_asts.next,
|
||||
struct dlm_lock, ast_list);
|
||||
/* get an extra ref on lock */
|
||||
dlm_lock_get(lock);
|
||||
res = lock->lockres;
|
||||
mlog(0, "delivering an ast for this lockres\n");
|
||||
|
||||
BUG_ON(!lock->ast_pending);
|
||||
|
||||
/* remove from list (including ref) */
|
||||
list_del_init(&lock->ast_list);
|
||||
dlm_lock_put(lock);
|
||||
spin_unlock(&dlm->ast_lock);
|
||||
|
||||
if (lock->ml.node != dlm->node_num) {
|
||||
ret = dlm_do_remote_ast(dlm, res, lock);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
} else
|
||||
dlm_do_local_ast(dlm, res, lock);
|
||||
|
||||
spin_lock(&dlm->ast_lock);
|
||||
|
||||
/* possible that another ast was queued while
|
||||
* we were delivering the last one */
|
||||
if (!list_empty(&lock->ast_list)) {
|
||||
mlog(0, "aha another ast got queued while "
|
||||
"we were finishing the last one. will "
|
||||
"keep the ast_pending flag set.\n");
|
||||
} else
|
||||
lock->ast_pending = 0;
|
||||
|
||||
/* drop the extra ref.
|
||||
* this may drop it completely. */
|
||||
dlm_lock_put(lock);
|
||||
dlm_lockres_release_ast(dlm, res);
|
||||
}
|
||||
|
||||
while (!list_empty(&dlm->pending_basts)) {
|
||||
lock = list_entry(dlm->pending_basts.next,
|
||||
struct dlm_lock, bast_list);
|
||||
/* get an extra ref on lock */
|
||||
dlm_lock_get(lock);
|
||||
res = lock->lockres;
|
||||
|
||||
BUG_ON(!lock->bast_pending);
|
||||
|
||||
/* get the highest blocked lock, and reset */
|
||||
spin_lock(&lock->spinlock);
|
||||
BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
|
||||
hi = lock->ml.highest_blocked;
|
||||
lock->ml.highest_blocked = LKM_IVMODE;
|
||||
spin_unlock(&lock->spinlock);
|
||||
|
||||
/* remove from list (including ref) */
|
||||
list_del_init(&lock->bast_list);
|
||||
dlm_lock_put(lock);
|
||||
spin_unlock(&dlm->ast_lock);
|
||||
|
||||
mlog(0, "delivering a bast for this lockres "
|
||||
"(blocked = %d\n", hi);
|
||||
|
||||
if (lock->ml.node != dlm->node_num) {
|
||||
ret = dlm_send_proxy_bast(dlm, res, lock, hi);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
} else
|
||||
dlm_do_local_bast(dlm, res, lock, hi);
|
||||
|
||||
spin_lock(&dlm->ast_lock);
|
||||
|
||||
/* possible that another bast was queued while
|
||||
* we were delivering the last one */
|
||||
if (!list_empty(&lock->bast_list)) {
|
||||
mlog(0, "aha another bast got queued while "
|
||||
"we were finishing the last one. will "
|
||||
"keep the bast_pending flag set.\n");
|
||||
} else
|
||||
lock->bast_pending = 0;
|
||||
|
||||
/* drop the extra ref.
|
||||
* this may drop it completely. */
|
||||
dlm_lock_put(lock);
|
||||
dlm_lockres_release_ast(dlm, res);
|
||||
}
|
||||
wake_up(&dlm->ast_wq);
|
||||
spin_unlock(&dlm->ast_lock);
|
||||
}
|
||||
|
||||
|
||||
#define DLM_THREAD_TIMEOUT_MS (4 * 1000)
|
||||
#define DLM_THREAD_MAX_DIRTY 100
|
||||
#define DLM_THREAD_MAX_ASTS 10
|
||||
|
||||
static int dlm_thread(void *data)
|
||||
{
|
||||
struct dlm_lock_resource *res;
|
||||
struct dlm_ctxt *dlm = data;
|
||||
unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS);
|
||||
|
||||
mlog(0, "dlm thread running for %s...\n", dlm->name);
|
||||
|
||||
while (!kthread_should_stop()) {
|
||||
int n = DLM_THREAD_MAX_DIRTY;
|
||||
|
||||
/* dlm_shutting_down is very point-in-time, but that
|
||||
* doesn't matter as we'll just loop back around if we
|
||||
* get false on the leading edge of a state
|
||||
* transition. */
|
||||
dlm_run_purge_list(dlm, dlm_shutting_down(dlm));
|
||||
|
||||
/* We really don't want to hold dlm->spinlock while
|
||||
* calling dlm_shuffle_lists on each lockres that
|
||||
* needs to have its queues adjusted and AST/BASTs
|
||||
* run. So let's pull each entry off the dirty_list
|
||||
* and drop dlm->spinlock ASAP. Once off the list,
|
||||
* res->spinlock needs to be taken again to protect
|
||||
* the queues while calling dlm_shuffle_lists. */
|
||||
spin_lock(&dlm->spinlock);
|
||||
while (!list_empty(&dlm->dirty_list)) {
|
||||
int delay = 0;
|
||||
res = list_entry(dlm->dirty_list.next,
|
||||
struct dlm_lock_resource, dirty);
|
||||
|
||||
/* peel a lockres off, remove it from the list,
|
||||
* unset the dirty flag and drop the dlm lock */
|
||||
BUG_ON(!res);
|
||||
dlm_lockres_get(res);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
res->state &= ~DLM_LOCK_RES_DIRTY;
|
||||
list_del_init(&res->dirty);
|
||||
spin_unlock(&res->spinlock);
|
||||
spin_unlock(&dlm->spinlock);
|
||||
|
||||
/* lockres can be re-dirtied/re-added to the
|
||||
* dirty_list in this gap, but that is ok */
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
if (res->owner != dlm->node_num) {
|
||||
__dlm_print_one_lock_resource(res);
|
||||
mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n",
|
||||
res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no",
|
||||
res->state & DLM_LOCK_RES_MIGRATING ? "yes" : "no",
|
||||
res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no",
|
||||
res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
|
||||
}
|
||||
BUG_ON(res->owner != dlm->node_num);
|
||||
|
||||
/* it is now ok to move lockreses in these states
|
||||
* to the dirty list, assuming that they will only be
|
||||
* dirty for a short while. */
|
||||
if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
|
||||
DLM_LOCK_RES_MIGRATING |
|
||||
DLM_LOCK_RES_RECOVERING)) {
|
||||
/* move it to the tail and keep going */
|
||||
spin_unlock(&res->spinlock);
|
||||
mlog(0, "delaying list shuffling for in-"
|
||||
"progress lockres %.*s, state=%d\n",
|
||||
res->lockname.len, res->lockname.name,
|
||||
res->state);
|
||||
delay = 1;
|
||||
goto in_progress;
|
||||
}
|
||||
|
||||
/* at this point the lockres is not migrating/
|
||||
* recovering/in-progress. we have the lockres
|
||||
* spinlock and do NOT have the dlm lock.
|
||||
* safe to reserve/queue asts and run the lists. */
|
||||
|
||||
mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
|
||||
"res=%p\n", dlm, res);
|
||||
|
||||
/* called while holding lockres lock */
|
||||
dlm_shuffle_lists(dlm, res);
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
dlm_lockres_calc_usage(dlm, res);
|
||||
|
||||
in_progress:
|
||||
|
||||
spin_lock(&dlm->spinlock);
|
||||
/* if the lock was in-progress, stick
|
||||
* it on the back of the list */
|
||||
if (delay) {
|
||||
spin_lock(&res->spinlock);
|
||||
list_add_tail(&res->dirty, &dlm->dirty_list);
|
||||
res->state |= DLM_LOCK_RES_DIRTY;
|
||||
spin_unlock(&res->spinlock);
|
||||
}
|
||||
dlm_lockres_put(res);
|
||||
|
||||
/* unlikely, but we may need to give time to
|
||||
* other tasks */
|
||||
if (!--n) {
|
||||
mlog(0, "throttling dlm_thread\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock(&dlm->spinlock);
|
||||
dlm_flush_asts(dlm);
|
||||
|
||||
/* yield and continue right away if there is more work to do */
|
||||
if (!n) {
|
||||
yield();
|
||||
continue;
|
||||
}
|
||||
|
||||
wait_event_interruptible_timeout(dlm->dlm_thread_wq,
|
||||
!dlm_dirty_list_empty(dlm) ||
|
||||
kthread_should_stop(),
|
||||
timeout);
|
||||
}
|
||||
|
||||
mlog(0, "quitting DLM thread\n");
|
||||
return 0;
|
||||
}
|
672
fs/ocfs2/dlm/dlmunlock.c
Normal file
672
fs/ocfs2/dlm/dlmunlock.c
Normal file
@ -0,0 +1,672 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmunlock.c
|
||||
*
|
||||
* underlying calls for unlocking locks
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/delay.h>
|
||||
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
#include "dlmcommon.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DLM
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
#define DLM_UNLOCK_FREE_LOCK 0x00000001
|
||||
#define DLM_UNLOCK_CALL_AST 0x00000002
|
||||
#define DLM_UNLOCK_REMOVE_LOCK 0x00000004
|
||||
#define DLM_UNLOCK_REGRANT_LOCK 0x00000008
|
||||
#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010
|
||||
|
||||
|
||||
static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int *actions);
|
||||
static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int *actions);
|
||||
|
||||
static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int flags,
|
||||
u8 owner);
|
||||
|
||||
|
||||
/*
|
||||
* according to the spec:
|
||||
* http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf
|
||||
*
|
||||
* flags & LKM_CANCEL != 0: must be converting or blocked
|
||||
* flags & LKM_CANCEL == 0: must be granted
|
||||
*
|
||||
* So to unlock a converting lock, you must first cancel the
|
||||
* convert (passing LKM_CANCEL in flags), then call the unlock
|
||||
* again (with no LKM_CANCEL in flags).
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: res->spinlock and lock->spinlock taken and dropped
|
||||
* held on exit: none
|
||||
* returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
|
||||
* all callers should have taken an extra ref on lock coming in
|
||||
*/
|
||||
static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int flags, int *call_ast,
|
||||
int master_node)
|
||||
{
|
||||
enum dlm_status status;
|
||||
int actions = 0;
|
||||
int in_use;
|
||||
u8 owner;
|
||||
|
||||
mlog(0, "master_node = %d, valblk = %d\n", master_node,
|
||||
flags & LKM_VALBLK);
|
||||
|
||||
if (master_node)
|
||||
BUG_ON(res->owner != dlm->node_num);
|
||||
else
|
||||
BUG_ON(res->owner == dlm->node_num);
|
||||
|
||||
spin_lock(&dlm->spinlock);
|
||||
/* We want to be sure that we're not freeing a lock
|
||||
* that still has AST's pending... */
|
||||
in_use = !list_empty(&lock->ast_list);
|
||||
spin_unlock(&dlm->spinlock);
|
||||
if (in_use) {
|
||||
mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
|
||||
"while waiting for an ast!", res->lockname.len,
|
||||
res->lockname.name);
|
||||
return DLM_BADPARAM;
|
||||
}
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
if (res->state & DLM_LOCK_RES_IN_PROGRESS) {
|
||||
if (master_node) {
|
||||
mlog(ML_ERROR, "lockres in progress!\n");
|
||||
spin_unlock(&res->spinlock);
|
||||
return DLM_FORWARD;
|
||||
}
|
||||
/* ok for this to sleep if not in a network handler */
|
||||
__dlm_wait_on_lockres(res);
|
||||
res->state |= DLM_LOCK_RES_IN_PROGRESS;
|
||||
}
|
||||
spin_lock(&lock->spinlock);
|
||||
|
||||
if (res->state & DLM_LOCK_RES_RECOVERING) {
|
||||
status = DLM_RECOVERING;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
|
||||
/* see above for what the spec says about
|
||||
* LKM_CANCEL and the lock queue state */
|
||||
if (flags & LKM_CANCEL)
|
||||
status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions);
|
||||
else
|
||||
status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions);
|
||||
|
||||
if (status != DLM_NORMAL)
|
||||
goto leave;
|
||||
|
||||
/* By now this has been masked out of cancel requests. */
|
||||
if (flags & LKM_VALBLK) {
|
||||
/* make the final update to the lvb */
|
||||
if (master_node)
|
||||
memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
|
||||
else
|
||||
flags |= LKM_PUT_LVB; /* let the send function
|
||||
* handle it. */
|
||||
}
|
||||
|
||||
if (!master_node) {
|
||||
owner = res->owner;
|
||||
/* drop locks and send message */
|
||||
if (flags & LKM_CANCEL)
|
||||
lock->cancel_pending = 1;
|
||||
else
|
||||
lock->unlock_pending = 1;
|
||||
spin_unlock(&lock->spinlock);
|
||||
spin_unlock(&res->spinlock);
|
||||
status = dlm_send_remote_unlock_request(dlm, res, lock, lksb,
|
||||
flags, owner);
|
||||
spin_lock(&res->spinlock);
|
||||
spin_lock(&lock->spinlock);
|
||||
/* if the master told us the lock was already granted,
|
||||
* let the ast handle all of these actions */
|
||||
if (status == DLM_NORMAL &&
|
||||
lksb->status == DLM_CANCELGRANT) {
|
||||
actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
|
||||
DLM_UNLOCK_REGRANT_LOCK|
|
||||
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
|
||||
}
|
||||
if (flags & LKM_CANCEL)
|
||||
lock->cancel_pending = 0;
|
||||
else
|
||||
lock->unlock_pending = 0;
|
||||
|
||||
}
|
||||
|
||||
/* get an extra ref on lock. if we are just switching
|
||||
* lists here, we dont want the lock to go away. */
|
||||
dlm_lock_get(lock);
|
||||
|
||||
if (actions & DLM_UNLOCK_REMOVE_LOCK) {
|
||||
list_del_init(&lock->list);
|
||||
dlm_lock_put(lock);
|
||||
}
|
||||
if (actions & DLM_UNLOCK_REGRANT_LOCK) {
|
||||
dlm_lock_get(lock);
|
||||
list_add_tail(&lock->list, &res->granted);
|
||||
}
|
||||
if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) {
|
||||
mlog(0, "clearing convert_type at %smaster node\n",
|
||||
master_node ? "" : "non-");
|
||||
lock->ml.convert_type = LKM_IVMODE;
|
||||
}
|
||||
|
||||
/* remove the extra ref on lock */
|
||||
dlm_lock_put(lock);
|
||||
|
||||
leave:
|
||||
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
|
||||
if (!dlm_lock_on_list(&res->converting, lock))
|
||||
BUG_ON(lock->ml.convert_type != LKM_IVMODE);
|
||||
else
|
||||
BUG_ON(lock->ml.convert_type == LKM_IVMODE);
|
||||
spin_unlock(&lock->spinlock);
|
||||
spin_unlock(&res->spinlock);
|
||||
wake_up(&res->wq);
|
||||
|
||||
/* let the caller's final dlm_lock_put handle the actual kfree */
|
||||
if (actions & DLM_UNLOCK_FREE_LOCK) {
|
||||
/* this should always be coupled with list removal */
|
||||
BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK));
|
||||
mlog(0, "lock %"MLFu64" should be gone now! refs=%d\n",
|
||||
lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1);
|
||||
dlm_lock_put(lock);
|
||||
}
|
||||
if (actions & DLM_UNLOCK_CALL_AST)
|
||||
*call_ast = 1;
|
||||
|
||||
/* if cancel or unlock succeeded, lvb work is done */
|
||||
if (status == DLM_NORMAL)
|
||||
lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
/* leave DLM_LKSB_PUT_LVB on the lksb so any final
|
||||
* update of the lvb will be sent to the new master */
|
||||
list_del_init(&lock->list);
|
||||
}
|
||||
|
||||
void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock)
|
||||
{
|
||||
list_del_init(&lock->list);
|
||||
list_add_tail(&lock->list, &res->granted);
|
||||
lock->ml.convert_type = LKM_IVMODE;
|
||||
}
|
||||
|
||||
|
||||
static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int flags,
|
||||
int *call_ast)
|
||||
{
|
||||
return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1);
|
||||
}
|
||||
|
||||
static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int flags, int *call_ast)
|
||||
{
|
||||
return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: none
|
||||
* held on exit: none
|
||||
* returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network
|
||||
*/
|
||||
static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int flags,
|
||||
u8 owner)
|
||||
{
|
||||
struct dlm_unlock_lock unlock;
|
||||
int tmpret;
|
||||
enum dlm_status ret;
|
||||
int status = 0;
|
||||
struct kvec vec[2];
|
||||
size_t veclen = 1;
|
||||
|
||||
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
|
||||
|
||||
memset(&unlock, 0, sizeof(unlock));
|
||||
unlock.node_idx = dlm->node_num;
|
||||
unlock.flags = cpu_to_be32(flags);
|
||||
unlock.cookie = lock->ml.cookie;
|
||||
unlock.namelen = res->lockname.len;
|
||||
memcpy(unlock.name, res->lockname.name, unlock.namelen);
|
||||
|
||||
vec[0].iov_len = sizeof(struct dlm_unlock_lock);
|
||||
vec[0].iov_base = &unlock;
|
||||
|
||||
if (flags & LKM_PUT_LVB) {
|
||||
/* extra data to send if we are updating lvb */
|
||||
vec[1].iov_len = DLM_LVB_LEN;
|
||||
vec[1].iov_base = lock->lksb->lvb;
|
||||
veclen++;
|
||||
}
|
||||
|
||||
tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key,
|
||||
vec, veclen, owner, &status);
|
||||
if (tmpret >= 0) {
|
||||
// successfully sent and received
|
||||
if (status == DLM_CANCELGRANT)
|
||||
ret = DLM_NORMAL;
|
||||
else if (status == DLM_FORWARD) {
|
||||
mlog(0, "master was in-progress. retry\n");
|
||||
ret = DLM_FORWARD;
|
||||
} else
|
||||
ret = status;
|
||||
lksb->status = status;
|
||||
} else {
|
||||
mlog_errno(tmpret);
|
||||
if (dlm_is_host_down(tmpret)) {
|
||||
/* NOTE: this seems strange, but it is what we want.
|
||||
* when the master goes down during a cancel or
|
||||
* unlock, the recovery code completes the operation
|
||||
* as if the master had not died, then passes the
|
||||
* updated state to the recovery master. this thread
|
||||
* just needs to finish out the operation and call
|
||||
* the unlockast. */
|
||||
ret = DLM_NORMAL;
|
||||
} else {
|
||||
/* something bad. this will BUG in ocfs2 */
|
||||
ret = dlm_err_to_dlm_status(tmpret);
|
||||
}
|
||||
lksb->status = ret;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* locking:
|
||||
* caller needs: none
|
||||
* taken: takes and drops res->spinlock
|
||||
* held on exit: none
|
||||
* returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID,
|
||||
* return value from dlmunlock_master
|
||||
*/
|
||||
int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data)
|
||||
{
|
||||
struct dlm_ctxt *dlm = data;
|
||||
struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf;
|
||||
struct dlm_lock_resource *res = NULL;
|
||||
struct list_head *iter;
|
||||
struct dlm_lock *lock = NULL;
|
||||
enum dlm_status status = DLM_NORMAL;
|
||||
int found = 0, i;
|
||||
struct dlm_lockstatus *lksb = NULL;
|
||||
int ignore;
|
||||
u32 flags;
|
||||
struct list_head *queue;
|
||||
|
||||
flags = be32_to_cpu(unlock->flags);
|
||||
|
||||
if (flags & LKM_GET_LVB) {
|
||||
mlog(ML_ERROR, "bad args! GET_LVB specified on unlock!\n");
|
||||
return DLM_BADARGS;
|
||||
}
|
||||
|
||||
if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) {
|
||||
mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL "
|
||||
"request!\n");
|
||||
return DLM_BADARGS;
|
||||
}
|
||||
|
||||
if (unlock->namelen > DLM_LOCKID_NAME_MAX) {
|
||||
mlog(ML_ERROR, "Invalid name length in unlock handler!\n");
|
||||
return DLM_IVBUFLEN;
|
||||
}
|
||||
|
||||
if (!dlm_grab(dlm))
|
||||
return DLM_REJECTED;
|
||||
|
||||
mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
|
||||
"Domain %s not fully joined!\n", dlm->name);
|
||||
|
||||
mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none");
|
||||
|
||||
res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen);
|
||||
if (!res) {
|
||||
/* We assume here that a no lock resource simply means
|
||||
* it was migrated away and destroyed before the other
|
||||
* node could detect it. */
|
||||
mlog(0, "returning DLM_FORWARD -- res no longer exists\n");
|
||||
status = DLM_FORWARD;
|
||||
goto not_found;
|
||||
}
|
||||
|
||||
queue=&res->granted;
|
||||
found = 0;
|
||||
spin_lock(&res->spinlock);
|
||||
if (res->state & DLM_LOCK_RES_RECOVERING) {
|
||||
spin_unlock(&res->spinlock);
|
||||
mlog(0, "returning DLM_RECOVERING\n");
|
||||
status = DLM_RECOVERING;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
if (res->state & DLM_LOCK_RES_MIGRATING) {
|
||||
spin_unlock(&res->spinlock);
|
||||
mlog(0, "returning DLM_MIGRATING\n");
|
||||
status = DLM_MIGRATING;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
if (res->owner != dlm->node_num) {
|
||||
spin_unlock(&res->spinlock);
|
||||
mlog(0, "returning DLM_FORWARD -- not master\n");
|
||||
status = DLM_FORWARD;
|
||||
goto leave;
|
||||
}
|
||||
|
||||
for (i=0; i<3; i++) {
|
||||
list_for_each(iter, queue) {
|
||||
lock = list_entry(iter, struct dlm_lock, list);
|
||||
if (lock->ml.cookie == unlock->cookie &&
|
||||
lock->ml.node == unlock->node_idx) {
|
||||
dlm_lock_get(lock);
|
||||
found = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found)
|
||||
break;
|
||||
/* scan granted -> converting -> blocked queues */
|
||||
queue++;
|
||||
}
|
||||
spin_unlock(&res->spinlock);
|
||||
if (!found) {
|
||||
status = DLM_IVLOCKID;
|
||||
goto not_found;
|
||||
}
|
||||
|
||||
/* lock was found on queue */
|
||||
lksb = lock->lksb;
|
||||
/* unlockast only called on originating node */
|
||||
if (flags & LKM_PUT_LVB) {
|
||||
lksb->flags |= DLM_LKSB_PUT_LVB;
|
||||
memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN);
|
||||
}
|
||||
|
||||
/* if this is in-progress, propagate the DLM_FORWARD
|
||||
* all the way back out */
|
||||
status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore);
|
||||
if (status == DLM_FORWARD)
|
||||
mlog(0, "lockres is in progress\n");
|
||||
|
||||
if (flags & LKM_PUT_LVB)
|
||||
lksb->flags &= ~DLM_LKSB_PUT_LVB;
|
||||
|
||||
dlm_lockres_calc_usage(dlm, res);
|
||||
dlm_kick_thread(dlm, res);
|
||||
|
||||
not_found:
|
||||
if (!found)
|
||||
mlog(ML_ERROR, "failed to find lock to unlock! "
|
||||
"cookie=%"MLFu64"\n",
|
||||
unlock->cookie);
|
||||
else {
|
||||
/* send the lksb->status back to the other node */
|
||||
status = lksb->status;
|
||||
dlm_lock_put(lock);
|
||||
}
|
||||
|
||||
leave:
|
||||
if (res)
|
||||
dlm_lockres_put(res);
|
||||
|
||||
dlm_put(dlm);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int *actions)
|
||||
{
|
||||
enum dlm_status status;
|
||||
|
||||
if (dlm_lock_on_list(&res->blocked, lock)) {
|
||||
/* cancel this outright */
|
||||
lksb->status = DLM_NORMAL;
|
||||
status = DLM_NORMAL;
|
||||
*actions = (DLM_UNLOCK_CALL_AST |
|
||||
DLM_UNLOCK_REMOVE_LOCK);
|
||||
} else if (dlm_lock_on_list(&res->converting, lock)) {
|
||||
/* cancel the request, put back on granted */
|
||||
lksb->status = DLM_NORMAL;
|
||||
status = DLM_NORMAL;
|
||||
*actions = (DLM_UNLOCK_CALL_AST |
|
||||
DLM_UNLOCK_REMOVE_LOCK |
|
||||
DLM_UNLOCK_REGRANT_LOCK |
|
||||
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
|
||||
} else if (dlm_lock_on_list(&res->granted, lock)) {
|
||||
/* too late, already granted. DLM_CANCELGRANT */
|
||||
lksb->status = DLM_CANCELGRANT;
|
||||
status = DLM_NORMAL;
|
||||
*actions = DLM_UNLOCK_CALL_AST;
|
||||
} else {
|
||||
mlog(ML_ERROR, "lock to cancel is not on any list!\n");
|
||||
lksb->status = DLM_IVLOCKID;
|
||||
status = DLM_IVLOCKID;
|
||||
*actions = 0;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm,
|
||||
struct dlm_lock_resource *res,
|
||||
struct dlm_lock *lock,
|
||||
struct dlm_lockstatus *lksb,
|
||||
int *actions)
|
||||
{
|
||||
enum dlm_status status;
|
||||
|
||||
/* unlock request */
|
||||
if (!dlm_lock_on_list(&res->granted, lock)) {
|
||||
lksb->status = DLM_DENIED;
|
||||
status = DLM_DENIED;
|
||||
dlm_error(status);
|
||||
*actions = 0;
|
||||
} else {
|
||||
/* unlock granted lock */
|
||||
lksb->status = DLM_NORMAL;
|
||||
status = DLM_NORMAL;
|
||||
*actions = (DLM_UNLOCK_FREE_LOCK |
|
||||
DLM_UNLOCK_CALL_AST |
|
||||
DLM_UNLOCK_REMOVE_LOCK);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
/* there seems to be no point in doing this async
|
||||
* since (even for the remote case) there is really
|
||||
* no work to queue up... so just do it and fire the
|
||||
* unlockast by hand when done... */
|
||||
enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
|
||||
int flags, dlm_astunlockfunc_t *unlockast, void *data)
|
||||
{
|
||||
enum dlm_status status;
|
||||
struct dlm_lock_resource *res;
|
||||
struct dlm_lock *lock = NULL;
|
||||
int call_ast, is_master;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
if (!lksb) {
|
||||
dlm_error(DLM_BADARGS);
|
||||
return DLM_BADARGS;
|
||||
}
|
||||
|
||||
if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) {
|
||||
dlm_error(DLM_BADPARAM);
|
||||
return DLM_BADPARAM;
|
||||
}
|
||||
|
||||
if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) {
|
||||
mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n");
|
||||
flags &= ~LKM_VALBLK;
|
||||
}
|
||||
|
||||
if (!lksb->lockid || !lksb->lockid->lockres) {
|
||||
dlm_error(DLM_BADPARAM);
|
||||
return DLM_BADPARAM;
|
||||
}
|
||||
|
||||
lock = lksb->lockid;
|
||||
BUG_ON(!lock);
|
||||
dlm_lock_get(lock);
|
||||
|
||||
res = lock->lockres;
|
||||
BUG_ON(!res);
|
||||
dlm_lockres_get(res);
|
||||
retry:
|
||||
call_ast = 0;
|
||||
/* need to retry up here because owner may have changed */
|
||||
mlog(0, "lock=%p res=%p\n", lock, res);
|
||||
|
||||
spin_lock(&res->spinlock);
|
||||
is_master = (res->owner == dlm->node_num);
|
||||
spin_unlock(&res->spinlock);
|
||||
|
||||
if (is_master) {
|
||||
status = dlmunlock_master(dlm, res, lock, lksb, flags,
|
||||
&call_ast);
|
||||
mlog(0, "done calling dlmunlock_master: returned %d, "
|
||||
"call_ast is %d\n", status, call_ast);
|
||||
} else {
|
||||
status = dlmunlock_remote(dlm, res, lock, lksb, flags,
|
||||
&call_ast);
|
||||
mlog(0, "done calling dlmunlock_remote: returned %d, "
|
||||
"call_ast is %d\n", status, call_ast);
|
||||
}
|
||||
|
||||
if (status == DLM_RECOVERING ||
|
||||
status == DLM_MIGRATING ||
|
||||
status == DLM_FORWARD) {
|
||||
/* We want to go away for a tiny bit to allow recovery
|
||||
* / migration to complete on this resource. I don't
|
||||
* know of any wait queue we could sleep on as this
|
||||
* may be happening on another node. Perhaps the
|
||||
* proper solution is to queue up requests on the
|
||||
* other end? */
|
||||
|
||||
/* do we want to yield(); ?? */
|
||||
msleep(50);
|
||||
|
||||
mlog(0, "retrying unlock due to pending recovery/"
|
||||
"migration/in-progress\n");
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (call_ast) {
|
||||
mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status);
|
||||
if (is_master) {
|
||||
/* it is possible that there is one last bast
|
||||
* pending. make sure it is flushed, then
|
||||
* call the unlockast.
|
||||
* not an issue if this is a mastered remotely,
|
||||
* since this lock has been removed from the
|
||||
* lockres queues and cannot be found. */
|
||||
dlm_kick_thread(dlm, NULL);
|
||||
wait_event(dlm->ast_wq,
|
||||
dlm_lock_basts_flushed(dlm, lock));
|
||||
}
|
||||
(*unlockast)(data, lksb->status);
|
||||
}
|
||||
|
||||
if (status == DLM_NORMAL) {
|
||||
mlog(0, "kicking the thread\n");
|
||||
dlm_kick_thread(dlm, res);
|
||||
} else
|
||||
dlm_error(status);
|
||||
|
||||
dlm_lockres_calc_usage(dlm, res);
|
||||
dlm_lockres_put(res);
|
||||
dlm_lock_put(lock);
|
||||
|
||||
mlog(0, "returning status=%d!\n", status);
|
||||
return status;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dlmunlock);
|
||||
|
42
fs/ocfs2/dlm/dlmver.c
Normal file
42
fs/ocfs2/dlm/dlmver.c
Normal file
@ -0,0 +1,42 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmver.c
|
||||
*
|
||||
* version string
|
||||
*
|
||||
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
|
||||
#include "dlmver.h"
|
||||
|
||||
#define DLM_BUILD_VERSION "1.3.3"
|
||||
|
||||
#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
|
||||
|
||||
void dlm_print_version(void)
|
||||
{
|
||||
printk(KERN_INFO "%s\n", VERSION_STR);
|
||||
}
|
||||
|
||||
MODULE_DESCRIPTION(VERSION_STR);
|
||||
|
||||
MODULE_VERSION(DLM_BUILD_VERSION);
|
31
fs/ocfs2/dlm/dlmver.h
Normal file
31
fs/ocfs2/dlm/dlmver.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmver.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef DLM_VER_H
|
||||
#define DLM_VER_H
|
||||
|
||||
void dlm_print_version(void);
|
||||
|
||||
#endif /* DLM_VER_H */
|
658
fs/ocfs2/dlm/userdlm.c
Normal file
658
fs/ocfs2/dlm/userdlm.c
Normal file
@ -0,0 +1,658 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* userdlm.c
|
||||
*
|
||||
* Code which implements the kernel side of a minimal userspace
|
||||
* interface to our DLM.
|
||||
*
|
||||
* Many of the functions here are pared down versions of dlmglue.c
|
||||
* functions.
|
||||
*
|
||||
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <asm/signal.h>
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/crc32.h>
|
||||
|
||||
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlmapi.h"
|
||||
|
||||
#include "userdlm.h"
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DLMFS
|
||||
#include "cluster/masklog.h"
|
||||
|
||||
static inline int user_check_wait_flag(struct user_lock_res *lockres,
|
||||
int flag)
|
||||
{
|
||||
int ret;
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
ret = lockres->l_flags & flag;
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void user_wait_on_busy_lock(struct user_lock_res *lockres)
|
||||
|
||||
{
|
||||
wait_event(lockres->l_event,
|
||||
!user_check_wait_flag(lockres, USER_LOCK_BUSY));
|
||||
}
|
||||
|
||||
static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres)
|
||||
|
||||
{
|
||||
wait_event(lockres->l_event,
|
||||
!user_check_wait_flag(lockres, USER_LOCK_BLOCKED));
|
||||
}
|
||||
|
||||
/* I heart container_of... */
|
||||
static inline struct dlm_ctxt *
|
||||
dlm_ctxt_from_user_lockres(struct user_lock_res *lockres)
|
||||
{
|
||||
struct dlmfs_inode_private *ip;
|
||||
|
||||
ip = container_of(lockres,
|
||||
struct dlmfs_inode_private,
|
||||
ip_lockres);
|
||||
return ip->ip_dlm;
|
||||
}
|
||||
|
||||
static struct inode *
|
||||
user_dlm_inode_from_user_lockres(struct user_lock_res *lockres)
|
||||
{
|
||||
struct dlmfs_inode_private *ip;
|
||||
|
||||
ip = container_of(lockres,
|
||||
struct dlmfs_inode_private,
|
||||
ip_lockres);
|
||||
return &ip->ip_vfs_inode;
|
||||
}
|
||||
|
||||
static inline void user_recover_from_dlm_error(struct user_lock_res *lockres)
|
||||
{
|
||||
spin_lock(&lockres->l_lock);
|
||||
lockres->l_flags &= ~USER_LOCK_BUSY;
|
||||
spin_unlock(&lockres->l_lock);
|
||||
}
|
||||
|
||||
/*
 * Log a failed dlm call at ML_ERROR: the status name, the name of the
 * dlm function that was called (_func, a string), the resource name and
 * the status message.
 *
 * _lockres is parenthesized before being dereferenced so that any
 * pointer-valued expression can be passed safely.
 */
#define user_log_dlm_error(_func, _stat, _lockres) do {			\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "		\
	     "resource %s: %s\n", dlm_errname(_stat), (_func),		\
	     (_lockres)->l_name, dlm_errmsg(_stat));			\
} while (0)
|
||||
|
||||
/* WARNING: This function lives in a world where the only three lock
|
||||
* levels are EX, PR, and NL. It *will* have to be adjusted when more
|
||||
* lock types are added. */
|
||||
static inline int user_highest_compat_lock_level(int level)
|
||||
{
|
||||
int new_level = LKM_EXMODE;
|
||||
|
||||
if (level == LKM_EXMODE)
|
||||
new_level = LKM_NLMODE;
|
||||
else if (level == LKM_PRMODE)
|
||||
new_level = LKM_PRMODE;
|
||||
return new_level;
|
||||
}
|
||||
|
||||
static void user_ast(void *opaque)
|
||||
{
|
||||
struct user_lock_res *lockres = opaque;
|
||||
struct dlm_lockstatus *lksb;
|
||||
|
||||
mlog(0, "AST fired for lockres %s\n", lockres->l_name);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
|
||||
lksb = &(lockres->l_lksb);
|
||||
if (lksb->status != DLM_NORMAL) {
|
||||
mlog(ML_ERROR, "lksb status value of %u on lockres %s\n",
|
||||
lksb->status, lockres->l_name);
|
||||
spin_unlock(&lockres->l_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* we're downconverting. */
|
||||
if (lockres->l_requested < lockres->l_level) {
|
||||
if (lockres->l_requested <=
|
||||
user_highest_compat_lock_level(lockres->l_blocking)) {
|
||||
lockres->l_blocking = LKM_NLMODE;
|
||||
lockres->l_flags &= ~USER_LOCK_BLOCKED;
|
||||
}
|
||||
}
|
||||
|
||||
lockres->l_level = lockres->l_requested;
|
||||
lockres->l_requested = LKM_IVMODE;
|
||||
lockres->l_flags |= USER_LOCK_ATTACHED;
|
||||
lockres->l_flags &= ~USER_LOCK_BUSY;
|
||||
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
wake_up(&lockres->l_event);
|
||||
}
|
||||
|
||||
static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres)
|
||||
{
|
||||
struct inode *inode;
|
||||
inode = user_dlm_inode_from_user_lockres(lockres);
|
||||
if (!igrab(inode))
|
||||
BUG();
|
||||
}
|
||||
|
||||
static void user_dlm_unblock_lock(void *opaque);
|
||||
|
||||
static void __user_dlm_queue_lockres(struct user_lock_res *lockres)
|
||||
{
|
||||
if (!(lockres->l_flags & USER_LOCK_QUEUED)) {
|
||||
user_dlm_grab_inode_ref(lockres);
|
||||
|
||||
INIT_WORK(&lockres->l_work, user_dlm_unblock_lock,
|
||||
lockres);
|
||||
|
||||
queue_work(user_dlm_worker, &lockres->l_work);
|
||||
lockres->l_flags |= USER_LOCK_QUEUED;
|
||||
}
|
||||
}
|
||||
|
||||
static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres)
|
||||
{
|
||||
int queue = 0;
|
||||
|
||||
if (!(lockres->l_flags & USER_LOCK_BLOCKED))
|
||||
return;
|
||||
|
||||
switch (lockres->l_blocking) {
|
||||
case LKM_EXMODE:
|
||||
if (!lockres->l_ex_holders && !lockres->l_ro_holders)
|
||||
queue = 1;
|
||||
break;
|
||||
case LKM_PRMODE:
|
||||
if (!lockres->l_ex_holders)
|
||||
queue = 1;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
if (queue)
|
||||
__user_dlm_queue_lockres(lockres);
|
||||
}
|
||||
|
||||
static void user_bast(void *opaque, int level)
|
||||
{
|
||||
struct user_lock_res *lockres = opaque;
|
||||
|
||||
mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n",
|
||||
lockres->l_name, level);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
lockres->l_flags |= USER_LOCK_BLOCKED;
|
||||
if (level > lockres->l_blocking)
|
||||
lockres->l_blocking = level;
|
||||
|
||||
__user_dlm_queue_lockres(lockres);
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
wake_up(&lockres->l_event);
|
||||
}
|
||||
|
||||
static void user_unlock_ast(void *opaque, enum dlm_status status)
|
||||
{
|
||||
struct user_lock_res *lockres = opaque;
|
||||
|
||||
mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
|
||||
|
||||
if (status != DLM_NORMAL)
|
||||
mlog(ML_ERROR, "Dlm returns status %d\n", status);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
|
||||
lockres->l_level = LKM_IVMODE;
|
||||
else {
|
||||
lockres->l_requested = LKM_IVMODE; /* cancel an
|
||||
* upconvert
|
||||
* request. */
|
||||
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
|
||||
/* we want the unblock thread to look at it again
|
||||
* now. */
|
||||
__user_dlm_queue_lockres(lockres);
|
||||
}
|
||||
|
||||
lockres->l_flags &= ~USER_LOCK_BUSY;
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
wake_up(&lockres->l_event);
|
||||
}
|
||||
|
||||
/* Release one reference on the inode that embeds this lock resource. */
static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres)
{
	iput(user_dlm_inode_from_user_lockres(lockres));
}
|
||||
|
||||
static void user_dlm_unblock_lock(void *opaque)
|
||||
{
|
||||
int new_level, status;
|
||||
struct user_lock_res *lockres = (struct user_lock_res *) opaque;
|
||||
struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
|
||||
|
||||
mlog(0, "processing lockres %s\n", lockres->l_name);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
|
||||
BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
|
||||
BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
|
||||
|
||||
/* notice that we don't clear USER_LOCK_BLOCKED here. That's
|
||||
* for user_ast to do. */
|
||||
lockres->l_flags &= ~USER_LOCK_QUEUED;
|
||||
|
||||
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
|
||||
mlog(0, "lock is in teardown so we do nothing\n");
|
||||
spin_unlock(&lockres->l_lock);
|
||||
goto drop_ref;
|
||||
}
|
||||
|
||||
if (lockres->l_flags & USER_LOCK_BUSY) {
|
||||
mlog(0, "BUSY flag detected...\n");
|
||||
if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
|
||||
spin_unlock(&lockres->l_lock);
|
||||
goto drop_ref;
|
||||
}
|
||||
|
||||
lockres->l_flags |= USER_LOCK_IN_CANCEL;
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
status = dlmunlock(dlm,
|
||||
&lockres->l_lksb,
|
||||
LKM_CANCEL,
|
||||
user_unlock_ast,
|
||||
lockres);
|
||||
if (status == DLM_CANCELGRANT) {
|
||||
/* If we got this, then the ast was fired
|
||||
* before we could cancel. We cleanup our
|
||||
* state, and restart the function. */
|
||||
spin_lock(&lockres->l_lock);
|
||||
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
|
||||
spin_unlock(&lockres->l_lock);
|
||||
} else if (status != DLM_NORMAL)
|
||||
user_log_dlm_error("dlmunlock", status, lockres);
|
||||
goto drop_ref;
|
||||
}
|
||||
|
||||
/* If there are still incompat holders, we can exit safely
|
||||
* without worrying about re-queueing this lock as that will
|
||||
* happen on the last call to user_cluster_unlock. */
|
||||
if ((lockres->l_blocking == LKM_EXMODE)
|
||||
&& (lockres->l_ex_holders || lockres->l_ro_holders)) {
|
||||
spin_unlock(&lockres->l_lock);
|
||||
mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n",
|
||||
lockres->l_ro_holders, lockres->l_ex_holders);
|
||||
goto drop_ref;
|
||||
}
|
||||
|
||||
if ((lockres->l_blocking == LKM_PRMODE)
|
||||
&& lockres->l_ex_holders) {
|
||||
spin_unlock(&lockres->l_lock);
|
||||
mlog(0, "can't downconvert for pr: ex = %u\n",
|
||||
lockres->l_ex_holders);
|
||||
goto drop_ref;
|
||||
}
|
||||
|
||||
/* yay, we can downconvert now. */
|
||||
new_level = user_highest_compat_lock_level(lockres->l_blocking);
|
||||
lockres->l_requested = new_level;
|
||||
lockres->l_flags |= USER_LOCK_BUSY;
|
||||
mlog(0, "Downconvert lock from %d to %d\n",
|
||||
lockres->l_level, new_level);
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
/* need lock downconvert request now... */
|
||||
status = dlmlock(dlm,
|
||||
new_level,
|
||||
&lockres->l_lksb,
|
||||
LKM_CONVERT|LKM_VALBLK,
|
||||
lockres->l_name,
|
||||
user_ast,
|
||||
lockres,
|
||||
user_bast);
|
||||
if (status != DLM_NORMAL) {
|
||||
user_log_dlm_error("dlmlock", status, lockres);
|
||||
user_recover_from_dlm_error(lockres);
|
||||
}
|
||||
|
||||
drop_ref:
|
||||
user_dlm_drop_inode_ref(lockres);
|
||||
}
|
||||
|
||||
static inline void user_dlm_inc_holders(struct user_lock_res *lockres,
|
||||
int level)
|
||||
{
|
||||
switch(level) {
|
||||
case LKM_EXMODE:
|
||||
lockres->l_ex_holders++;
|
||||
break;
|
||||
case LKM_PRMODE:
|
||||
lockres->l_ro_holders++;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/* predict what lock level we'll be dropping down to on behalf
|
||||
* of another node, and return true if the currently wanted
|
||||
* level will be compatible with it. */
|
||||
static inline int
|
||||
user_may_continue_on_blocked_lock(struct user_lock_res *lockres,
|
||||
int wanted)
|
||||
{
|
||||
BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
|
||||
|
||||
return wanted <= user_highest_compat_lock_level(lockres->l_blocking);
|
||||
}
|
||||
|
||||
int user_dlm_cluster_lock(struct user_lock_res *lockres,
|
||||
int level,
|
||||
int lkm_flags)
|
||||
{
|
||||
int status, local_flags;
|
||||
struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
|
||||
|
||||
if (level != LKM_EXMODE &&
|
||||
level != LKM_PRMODE) {
|
||||
mlog(ML_ERROR, "lockres %s: invalid request!\n",
|
||||
lockres->l_name);
|
||||
status = -EINVAL;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n",
|
||||
lockres->l_name,
|
||||
(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE",
|
||||
lkm_flags);
|
||||
|
||||
again:
|
||||
if (signal_pending(current)) {
|
||||
status = -ERESTARTSYS;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
|
||||
/* We only compare against the currently granted level
|
||||
* here. If the lock is blocked waiting on a downconvert,
|
||||
* we'll get caught below. */
|
||||
if ((lockres->l_flags & USER_LOCK_BUSY) &&
|
||||
(level > lockres->l_level)) {
|
||||
/* is someone sitting in dlm_lock? If so, wait on
|
||||
* them. */
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
user_wait_on_busy_lock(lockres);
|
||||
goto again;
|
||||
}
|
||||
|
||||
if ((lockres->l_flags & USER_LOCK_BLOCKED) &&
|
||||
(!user_may_continue_on_blocked_lock(lockres, level))) {
|
||||
/* is the lock is currently blocked on behalf of
|
||||
* another node */
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
user_wait_on_blocked_lock(lockres);
|
||||
goto again;
|
||||
}
|
||||
|
||||
if (level > lockres->l_level) {
|
||||
local_flags = lkm_flags | LKM_VALBLK;
|
||||
if (lockres->l_level != LKM_IVMODE)
|
||||
local_flags |= LKM_CONVERT;
|
||||
|
||||
lockres->l_requested = level;
|
||||
lockres->l_flags |= USER_LOCK_BUSY;
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
BUG_ON(level == LKM_IVMODE);
|
||||
BUG_ON(level == LKM_NLMODE);
|
||||
|
||||
mlog(0, "lock %s, get lock from %d to level = %d\n",
|
||||
lockres->l_name, lockres->l_level, level);
|
||||
|
||||
/* call dlm_lock to upgrade lock now */
|
||||
status = dlmlock(dlm,
|
||||
level,
|
||||
&lockres->l_lksb,
|
||||
local_flags,
|
||||
lockres->l_name,
|
||||
user_ast,
|
||||
lockres,
|
||||
user_bast);
|
||||
if (status != DLM_NORMAL) {
|
||||
if ((lkm_flags & LKM_NOQUEUE) &&
|
||||
(status == DLM_NOTQUEUED))
|
||||
status = -EAGAIN;
|
||||
else {
|
||||
user_log_dlm_error("dlmlock", status, lockres);
|
||||
status = -EINVAL;
|
||||
}
|
||||
user_recover_from_dlm_error(lockres);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
mlog(0, "lock %s, successfull return from dlmlock\n",
|
||||
lockres->l_name);
|
||||
|
||||
user_wait_on_busy_lock(lockres);
|
||||
goto again;
|
||||
}
|
||||
|
||||
user_dlm_inc_holders(lockres, level);
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name,
|
||||
(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
|
||||
|
||||
status = 0;
|
||||
bail:
|
||||
return status;
|
||||
}
|
||||
|
||||
static inline void user_dlm_dec_holders(struct user_lock_res *lockres,
|
||||
int level)
|
||||
{
|
||||
switch(level) {
|
||||
case LKM_EXMODE:
|
||||
BUG_ON(!lockres->l_ex_holders);
|
||||
lockres->l_ex_holders--;
|
||||
break;
|
||||
case LKM_PRMODE:
|
||||
BUG_ON(!lockres->l_ro_holders);
|
||||
lockres->l_ro_holders--;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
|
||||
int level)
|
||||
{
|
||||
if (level != LKM_EXMODE &&
|
||||
level != LKM_PRMODE) {
|
||||
mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name);
|
||||
return;
|
||||
}
|
||||
|
||||
mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name,
|
||||
(level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE");
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
user_dlm_dec_holders(lockres, level);
|
||||
__user_dlm_cond_queue_lockres(lockres);
|
||||
spin_unlock(&lockres->l_lock);
|
||||
}
|
||||
|
||||
void user_dlm_write_lvb(struct inode *inode,
|
||||
const char *val,
|
||||
unsigned int len)
|
||||
{
|
||||
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
|
||||
char *lvb = lockres->l_lksb.lvb;
|
||||
|
||||
BUG_ON(len > DLM_LVB_LEN);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
|
||||
BUG_ON(lockres->l_level < LKM_EXMODE);
|
||||
memcpy(lvb, val, len);
|
||||
|
||||
spin_unlock(&lockres->l_lock);
|
||||
}
|
||||
|
||||
void user_dlm_read_lvb(struct inode *inode,
|
||||
char *val,
|
||||
unsigned int len)
|
||||
{
|
||||
struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
|
||||
char *lvb = lockres->l_lksb.lvb;
|
||||
|
||||
BUG_ON(len > DLM_LVB_LEN);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
|
||||
BUG_ON(lockres->l_level < LKM_PRMODE);
|
||||
memcpy(val, lvb, len);
|
||||
|
||||
spin_unlock(&lockres->l_lock);
|
||||
}
|
||||
|
||||
void user_dlm_lock_res_init(struct user_lock_res *lockres,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
memset(lockres, 0, sizeof(*lockres));
|
||||
|
||||
spin_lock_init(&lockres->l_lock);
|
||||
init_waitqueue_head(&lockres->l_event);
|
||||
lockres->l_level = LKM_IVMODE;
|
||||
lockres->l_requested = LKM_IVMODE;
|
||||
lockres->l_blocking = LKM_IVMODE;
|
||||
|
||||
/* should have been checked before getting here. */
|
||||
BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN);
|
||||
|
||||
memcpy(lockres->l_name,
|
||||
dentry->d_name.name,
|
||||
dentry->d_name.len);
|
||||
}
|
||||
|
||||
int user_dlm_destroy_lock(struct user_lock_res *lockres)
|
||||
{
|
||||
int status = -EBUSY;
|
||||
struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres);
|
||||
|
||||
mlog(0, "asked to destroy %s\n", lockres->l_name);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
while (lockres->l_flags & USER_LOCK_BUSY) {
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
mlog(0, "lock %s is busy\n", lockres->l_name);
|
||||
|
||||
user_wait_on_busy_lock(lockres);
|
||||
|
||||
spin_lock(&lockres->l_lock);
|
||||
}
|
||||
|
||||
if (lockres->l_ro_holders || lockres->l_ex_holders) {
|
||||
spin_unlock(&lockres->l_lock);
|
||||
mlog(0, "lock %s has holders\n", lockres->l_name);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = 0;
|
||||
if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
|
||||
spin_unlock(&lockres->l_lock);
|
||||
mlog(0, "lock %s is not attached\n", lockres->l_name);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
lockres->l_flags &= ~USER_LOCK_ATTACHED;
|
||||
lockres->l_flags |= USER_LOCK_BUSY;
|
||||
lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
|
||||
spin_unlock(&lockres->l_lock);
|
||||
|
||||
mlog(0, "unlocking lockres %s\n", lockres->l_name);
|
||||
status = dlmunlock(dlm,
|
||||
&lockres->l_lksb,
|
||||
LKM_VALBLK,
|
||||
user_unlock_ast,
|
||||
lockres);
|
||||
if (status != DLM_NORMAL) {
|
||||
user_log_dlm_error("dlmunlock", status, lockres);
|
||||
status = -EINVAL;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
user_wait_on_busy_lock(lockres);
|
||||
|
||||
status = 0;
|
||||
bail:
|
||||
return status;
|
||||
}
|
||||
|
||||
struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
|
||||
{
|
||||
struct dlm_ctxt *dlm;
|
||||
u32 dlm_key;
|
||||
char *domain;
|
||||
|
||||
domain = kmalloc(name->len + 1, GFP_KERNEL);
|
||||
if (!domain) {
|
||||
mlog_errno(-ENOMEM);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
dlm_key = crc32_le(0, name->name, name->len);
|
||||
|
||||
snprintf(domain, name->len + 1, "%.*s", name->len, name->name);
|
||||
|
||||
dlm = dlm_register_domain(domain, dlm_key);
|
||||
if (IS_ERR(dlm))
|
||||
mlog_errno(PTR_ERR(dlm));
|
||||
|
||||
kfree(domain);
|
||||
return dlm;
|
||||
}
|
||||
|
||||
/*
 * Counterpart of user_dlm_register_context(): hand the context back to
 * dlm_unregister_domain().
 */
void user_dlm_unregister_context(struct dlm_ctxt *dlm)
{
	dlm_unregister_domain(dlm);
}
|
111
fs/ocfs2/dlm/userdlm.h
Normal file
111
fs/ocfs2/dlm/userdlm.h
Normal file
@ -0,0 +1,111 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* userdlm.h
|
||||
*
|
||||
* Userspace dlm defines
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef USERDLM_H
|
||||
#define USERDLM_H
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/workqueue.h>
|
||||
|
||||
/* user_lock_res->l_flags flags. */
|
||||
#define USER_LOCK_ATTACHED (0x00000001) /* have we initialized
|
||||
* the lvb */
|
||||
#define USER_LOCK_BUSY (0x00000002) /* we are currently in
|
||||
* dlm_lock */
|
||||
#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to
|
||||
* downconvert*/
|
||||
#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently
|
||||
* destroying this
|
||||
* lock. */
|
||||
#define USER_LOCK_QUEUED (0x00000010) /* lock is on the
|
||||
* workqueue */
|
||||
#define USER_LOCK_IN_CANCEL (0x00000020) /* a dlmunlock(LKM_CANCEL) has been issued and has not completed yet */
|
||||
|
||||
struct user_lock_res {
|
||||
spinlock_t l_lock;
|
||||
|
||||
int l_flags;
|
||||
|
||||
#define USER_DLM_LOCK_ID_MAX_LEN 32
|
||||
char l_name[USER_DLM_LOCK_ID_MAX_LEN];
|
||||
int l_level;
|
||||
unsigned int l_ro_holders;
|
||||
unsigned int l_ex_holders;
|
||||
struct dlm_lockstatus l_lksb;
|
||||
|
||||
int l_requested;
|
||||
int l_blocking;
|
||||
|
||||
wait_queue_head_t l_event;
|
||||
|
||||
struct work_struct l_work;
|
||||
};
|
||||
|
||||
extern struct workqueue_struct *user_dlm_worker;
|
||||
|
||||
void user_dlm_lock_res_init(struct user_lock_res *lockres,
|
||||
struct dentry *dentry);
|
||||
int user_dlm_destroy_lock(struct user_lock_res *lockres);
|
||||
int user_dlm_cluster_lock(struct user_lock_res *lockres,
|
||||
int level,
|
||||
int lkm_flags);
|
||||
void user_dlm_cluster_unlock(struct user_lock_res *lockres,
|
||||
int level);
|
||||
void user_dlm_write_lvb(struct inode *inode,
|
||||
const char *val,
|
||||
unsigned int len);
|
||||
void user_dlm_read_lvb(struct inode *inode,
|
||||
char *val,
|
||||
unsigned int len);
|
||||
struct dlm_ctxt *user_dlm_register_context(struct qstr *name);
|
||||
void user_dlm_unregister_context(struct dlm_ctxt *dlm);
|
||||
|
||||
struct dlmfs_inode_private {
|
||||
struct dlm_ctxt *ip_dlm;
|
||||
|
||||
struct user_lock_res ip_lockres; /* unused for directories. */
|
||||
struct inode *ip_parent;
|
||||
|
||||
struct inode ip_vfs_inode;
|
||||
};
|
||||
|
||||
static inline struct dlmfs_inode_private *
|
||||
DLMFS_I(struct inode *inode)
|
||||
{
|
||||
return container_of(inode,
|
||||
struct dlmfs_inode_private,
|
||||
ip_vfs_inode);
|
||||
}
|
||||
|
||||
/* dlmfs per-open-file data. */
struct dlmfs_filp_private {
	int	fp_lock_level;	/* NOTE(review): presumably the LKM_* level taken for this open file - confirm against dlmfs.c */
};
|
||||
|
||||
#define DLMFS_MAGIC 0x76a9f425
|
||||
|
||||
#endif /* USERDLM_H */
|
2904
fs/ocfs2/dlmglue.c
Normal file
2904
fs/ocfs2/dlmglue.c
Normal file
File diff suppressed because it is too large
Load Diff
111
fs/ocfs2/dlmglue.h
Normal file
111
fs/ocfs2/dlmglue.h
Normal file
@ -0,0 +1,111 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* dlmglue.h
|
||||
*
|
||||
* Lock value block layout and prototypes for the OCFS2 cluster locking (dlmglue) layer
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef DLMGLUE_H
|
||||
#define DLMGLUE_H
|
||||
|
||||
#define OCFS2_LVB_VERSION 2
|
||||
|
||||
struct ocfs2_meta_lvb {
|
||||
__be32 lvb_version;
|
||||
__be32 lvb_iclusters;
|
||||
__be32 lvb_iuid;
|
||||
__be32 lvb_igid;
|
||||
__be64 lvb_iatime_packed;
|
||||
__be64 lvb_ictime_packed;
|
||||
__be64 lvb_imtime_packed;
|
||||
__be64 lvb_isize;
|
||||
__be16 lvb_imode;
|
||||
__be16 lvb_inlink;
|
||||
__be32 lvb_reserved[3];
|
||||
};
|
||||
|
||||
/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
|
||||
/* don't wait on recovery. */
|
||||
#define OCFS2_META_LOCK_RECOVERY (0x01)
|
||||
/* Instruct the dlm not to queue ourselves on the other node. */
|
||||
#define OCFS2_META_LOCK_NOQUEUE (0x02)
|
||||
/* don't block waiting for the vote thread, instead return -EAGAIN */
|
||||
#define OCFS2_LOCK_NONBLOCK (0x04)
|
||||
|
||||
int ocfs2_dlm_init(struct ocfs2_super *osb);
|
||||
void ocfs2_dlm_shutdown(struct ocfs2_super *osb);
|
||||
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
|
||||
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
|
||||
enum ocfs2_lock_type type,
|
||||
struct inode *inode);
|
||||
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
|
||||
int ocfs2_create_new_inode_locks(struct inode *inode);
|
||||
int ocfs2_drop_inode_locks(struct inode *inode);
|
||||
int ocfs2_data_lock_full(struct inode *inode,
|
||||
int write,
|
||||
int arg_flags);
|
||||
#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0)
|
||||
int ocfs2_data_lock_with_page(struct inode *inode,
|
||||
int write,
|
||||
struct page *page);
|
||||
void ocfs2_data_unlock(struct inode *inode,
|
||||
int write);
|
||||
int ocfs2_rw_lock(struct inode *inode, int write);
|
||||
void ocfs2_rw_unlock(struct inode *inode, int write);
|
||||
int ocfs2_meta_lock_full(struct inode *inode,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct buffer_head **ret_bh,
|
||||
int ex,
|
||||
int arg_flags);
|
||||
int ocfs2_meta_lock_with_page(struct inode *inode,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct buffer_head **ret_bh,
|
||||
int ex,
|
||||
struct page *page);
|
||||
/* 99% of the time we don't want to supply any additional flags --
|
||||
* those are for very specific cases only. */
|
||||
#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0)
|
||||
void ocfs2_meta_unlock(struct inode *inode,
|
||||
int ex);
|
||||
int ocfs2_super_lock(struct ocfs2_super *osb,
|
||||
int ex);
|
||||
void ocfs2_super_unlock(struct ocfs2_super *osb,
|
||||
int ex);
|
||||
int ocfs2_rename_lock(struct ocfs2_super *osb);
|
||||
void ocfs2_rename_unlock(struct ocfs2_super *osb);
|
||||
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
|
||||
|
||||
/* for the vote thread */
|
||||
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
|
||||
struct ocfs2_lock_res *lockres);
|
||||
|
||||
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
|
||||
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
|
||||
|
||||
/* aids in debugging and tracking lvbs */
|
||||
void ocfs2_dump_meta_lvb_info(u64 level,
|
||||
const char *function,
|
||||
unsigned int line,
|
||||
struct ocfs2_lock_res *lockres);
|
||||
#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
|
||||
|
||||
#endif /* DLMGLUE_H */
|
45
fs/ocfs2/endian.h
Normal file
45
fs/ocfs2/endian.h
Normal file
@ -0,0 +1,45 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* Copyright (C) 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_ENDIAN_H
|
||||
#define OCFS2_ENDIAN_H
|
||||
|
||||
/* Add 'val' to the little-endian 16-bit value at *var, in place. */
static inline void le16_add_cpu(__le16 *var, u16 val)
{
	u16 sum = le16_to_cpu(*var) + val;

	*var = cpu_to_le16(sum);
}
|
||||
|
||||
/* Add 'val' to the little-endian 32-bit value at *var, in place. */
static inline void le32_add_cpu(__le32 *var, u32 val)
{
	u32 sum = le32_to_cpu(*var) + val;

	*var = cpu_to_le32(sum);
}
|
||||
|
||||
/* Bitwise-AND the little-endian 32-bit value at *var with 'val', in place. */
static inline void le32_and_cpu(__le32 *var, u32 val)
{
	u32 masked = le32_to_cpu(*var) & val;

	*var = cpu_to_le32(masked);
}
|
||||
|
||||
/* Add 'val' to the big-endian 32-bit value at *var, in place. */
static inline void be32_add_cpu(__be32 *var, u32 val)
{
	u32 sum = be32_to_cpu(*var) + val;

	*var = cpu_to_be32(sum);
}
|
||||
|
||||
#endif /* OCFS2_ENDIAN_H */
|
248
fs/ocfs2/export.c
Normal file
248
fs/ocfs2/export.c
Normal file
@ -0,0 +1,248 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* export.c
|
||||
*
|
||||
* Functions to facilitate NFS exporting
|
||||
*
|
||||
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_EXPORT
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "dir.h"
|
||||
#include "dlmglue.h"
|
||||
#include "export.h"
|
||||
#include "inode.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
struct ocfs2_inode_handle
|
||||
{
|
||||
u64 ih_blkno;
|
||||
u32 ih_generation;
|
||||
};
|
||||
|
||||
static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp)
|
||||
{
|
||||
struct ocfs2_inode_handle *handle = vobjp;
|
||||
struct inode *inode;
|
||||
struct dentry *result;
|
||||
|
||||
mlog_entry("(0x%p, 0x%p)\n", sb, handle);
|
||||
|
||||
if (handle->ih_blkno == 0) {
|
||||
mlog_errno(-ESTALE);
|
||||
return ERR_PTR(-ESTALE);
|
||||
}
|
||||
|
||||
inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno);
|
||||
|
||||
if (IS_ERR(inode)) {
|
||||
mlog_errno(PTR_ERR(inode));
|
||||
return (void *)inode;
|
||||
}
|
||||
|
||||
if (handle->ih_generation != inode->i_generation) {
|
||||
iput(inode);
|
||||
mlog_errno(-ESTALE);
|
||||
return ERR_PTR(-ESTALE);
|
||||
}
|
||||
|
||||
result = d_alloc_anon(inode);
|
||||
|
||||
if (!result) {
|
||||
iput(inode);
|
||||
mlog_errno(-ENOMEM);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
mlog_exit_ptr(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
static struct dentry *ocfs2_get_parent(struct dentry *child)
|
||||
{
|
||||
int status;
|
||||
u64 blkno;
|
||||
struct dentry *parent;
|
||||
struct inode *inode;
|
||||
struct inode *dir = child->d_inode;
|
||||
struct buffer_head *dirent_bh = NULL;
|
||||
struct ocfs2_dir_entry *dirent;
|
||||
|
||||
mlog_entry("(0x%p, '%.*s')\n", child,
|
||||
child->d_name.len, child->d_name.name);
|
||||
|
||||
mlog(0, "find parent of directory %"MLFu64"\n",
|
||||
OCFS2_I(dir)->ip_blkno);
|
||||
|
||||
status = ocfs2_meta_lock(dir, NULL, NULL, 0);
|
||||
if (status < 0) {
|
||||
if (status != -ENOENT)
|
||||
mlog_errno(status);
|
||||
parent = ERR_PTR(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
|
||||
&dirent);
|
||||
if (status < 0) {
|
||||
parent = ERR_PTR(-ENOENT);
|
||||
goto bail_unlock;
|
||||
}
|
||||
|
||||
inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno);
|
||||
if (IS_ERR(inode)) {
|
||||
mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno);
|
||||
parent = ERR_PTR(-EACCES);
|
||||
goto bail_unlock;
|
||||
}
|
||||
|
||||
parent = d_alloc_anon(inode);
|
||||
if (!parent) {
|
||||
iput(inode);
|
||||
parent = ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
bail_unlock:
|
||||
ocfs2_meta_unlock(dir, 0);
|
||||
|
||||
if (dirent_bh)
|
||||
brelse(dirent_bh);
|
||||
|
||||
bail:
|
||||
mlog_exit_ptr(parent);
|
||||
|
||||
return parent;
|
||||
}
|
||||
|
||||
/*
 * Encode @dentry into the file handle buffer @fh.
 *
 * Layout (32-bit words, each stored with cpu_to_le32() even though
 * the buffer is typed __be32):
 *   fh[0..1]  inode block number, high word first
 *   fh[2]     inode generation
 *   fh[3..5]  the same triple for the parent directory; written only
 *             when @connectable is set and the inode is not a directory
 *
 * @max_len holds the buffer size in words on entry and is updated to
 * the number of words written (3 or 6) on success.
 *
 * Returns the handle type: 1 (inode only), 2 (inode plus parent), or
 * 255 if the buffer is too small, in which case @max_len is untouched.
 */
static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len,
			   int connectable)
{
	struct inode *inode = dentry->d_inode;
	struct inode *parent;
	int avail = *max_len;
	int type;
	u64 blkno;
	u32 generation;

	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
		   dentry->d_name.len, dentry->d_name.name,
		   fh, avail, connectable);

	if (avail < 3 || (connectable && avail < 6)) {
		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
		type = 255;
		goto bail;
	}

	blkno = OCFS2_I(inode)->ip_blkno;
	generation = inode->i_generation;

	mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n",
	     blkno, generation);

	fh[0] = cpu_to_le32((u32)(blkno >> 32));
	fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
	fh[2] = cpu_to_le32(generation);

	if (!connectable || S_ISDIR(inode->i_mode)) {
		/* No parent information wanted: short handle. */
		*max_len = 3;
		type = 1;
		goto bail;
	}

	/* d_lock is held while d_parent is dereferenced. */
	spin_lock(&dentry->d_lock);

	parent = dentry->d_parent->d_inode;
	blkno = OCFS2_I(parent)->ip_blkno;
	generation = parent->i_generation;

	fh[3] = cpu_to_le32((u32)(blkno >> 32));
	fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
	fh[5] = cpu_to_le32(generation);

	spin_unlock(&dentry->d_lock);

	type = 2;

	mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n",
	     blkno, generation);

	*max_len = 6;

bail:
	mlog_exit(type);
	return type;
}
|
||||
|
||||
/*
 * Decode a file handle produced by ocfs2_encode_fh() and look up the
 * matching dentry through find_exported_dentry().
 *
 * @fh holds @fh_len 32-bit words stored little-endian: words 0-2
 * describe the inode (block number high, block number low,
 * generation); for @fileid_type 2, words 3-5 describe the parent
 * directory in the same way.
 *
 * Returns NULL when the handle is too short for its type or the type
 * is greater than 2; otherwise whatever find_exported_dentry() returns.
 */
static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh,
				      int fh_len, int fileid_type,
				      int (*acceptable)(void *context,
							struct dentry *de),
				      void *context)
{
	struct ocfs2_inode_handle handle;
	/*
	 * Zeroed up front: when the handle carries no parent information
	 * (fileid_type != 2) the parent handle must not expose stale stack
	 * contents to find_exported_dentry().  A zero ih_blkno is turned
	 * into -ESTALE by ocfs2_get_dentry().
	 */
	struct ocfs2_inode_handle parent = { 0, };
	struct dentry *ret = NULL;

	mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n",
		   sb, fh, fh_len, fileid_type, acceptable, context);

	if (fh_len < 3 || fileid_type > 2)
		goto bail;

	if (fileid_type == 2) {
		if (fh_len < 6)
			goto bail;

		parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32;
		parent.ih_blkno |= (u64)le32_to_cpu(fh[4]);
		parent.ih_generation = le32_to_cpu(fh[5]);

		mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n",
		     parent.ih_blkno, parent.ih_generation);
	}

	handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32;
	handle.ih_blkno |= (u64)le32_to_cpu(fh[1]);
	handle.ih_generation = le32_to_cpu(fh[2]);

	/* This is the decode path; the message used to say "Encoding". */
	mlog(0, "Decoding fh: blkno: %"MLFu64", generation: %u\n",
	     handle.ih_blkno, handle.ih_generation);

	ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, &parent,
						    acceptable, context);

bail:
	mlog_exit_ptr(ret);
	return ret;
}
|
||||
|
||||
struct export_operations ocfs2_export_ops = {
|
||||
.decode_fh = ocfs2_decode_fh,
|
||||
.encode_fh = ocfs2_encode_fh,
|
||||
|
||||
.get_parent = ocfs2_get_parent,
|
||||
.get_dentry = ocfs2_get_dentry,
|
||||
};
|
31
fs/ocfs2/export.h
Normal file
31
fs/ocfs2/export.h
Normal file
@ -0,0 +1,31 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* export.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_EXPORT_H
|
||||
#define OCFS2_EXPORT_H
|
||||
|
||||
extern struct export_operations ocfs2_export_ops;
|
||||
|
||||
#endif /* OCFS2_EXPORT_H */
|
994
fs/ocfs2/extent_map.c
Normal file
994
fs/ocfs2/extent_map.c
Normal file
@ -0,0 +1,994 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* extent_map.c
|
||||
*
|
||||
* In-memory extent map for OCFS2. Man, this code was prettier in
|
||||
* the library.
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License, version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/rbtree.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_EXTENT_MAP
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "extent_map.h"
|
||||
#include "inode.h"
|
||||
#include "super.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
|
||||
/*
|
||||
* SUCK SUCK SUCK
|
||||
* Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
|
||||
*/
|
||||
|
||||
struct ocfs2_extent_map_entry {
|
||||
struct rb_node e_node;
|
||||
int e_tree_depth;
|
||||
struct ocfs2_extent_rec e_rec;
|
||||
};
|
||||
|
||||
/*
 * State shared between ocfs2_extent_map_insert() and
 * ocfs2_extent_map_try_insert() across retries.  Any entry pointer
 * still set when the insert finishes is freed by
 * ocfs2_extent_map_insert().
 */
struct ocfs2_em_insert_context {
	/* Set by try_insert: a left/right remainder piece is required. */
	int need_left;
	int need_right;

	/* Entry being inserted; cleared once it is linked into the tree. */
	struct ocfs2_extent_map_entry *new_ent;
	/* Entry erased from the tree to make room, awaiting free. */
	struct ocfs2_extent_map_entry *old_ent;
	/* Preallocated remainder pieces; cleared once linked in. */
	struct ocfs2_extent_map_entry *left_ent;
	struct ocfs2_extent_map_entry *right_ent;
};
|
||||
|
||||
static kmem_cache_t *ocfs2_em_ent_cachep = NULL;
|
||||
|
||||
|
||||
static struct ocfs2_extent_map_entry *
|
||||
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
|
||||
u32 cpos, u32 clusters,
|
||||
struct rb_node ***ret_p,
|
||||
struct rb_node **ret_parent);
|
||||
static int ocfs2_extent_map_insert(struct inode *inode,
|
||||
struct ocfs2_extent_rec *rec,
|
||||
int tree_depth);
|
||||
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
|
||||
struct ocfs2_extent_map_entry *ent);
|
||||
static int ocfs2_extent_map_find_leaf(struct inode *inode,
|
||||
u32 cpos, u32 clusters,
|
||||
struct ocfs2_extent_list *el);
|
||||
static int ocfs2_extent_map_lookup_read(struct inode *inode,
|
||||
u32 cpos, u32 clusters,
|
||||
struct ocfs2_extent_map_entry **ret_ent);
|
||||
static int ocfs2_extent_map_try_insert(struct inode *inode,
|
||||
struct ocfs2_extent_rec *rec,
|
||||
int tree_depth,
|
||||
struct ocfs2_em_insert_context *ctxt);
|
||||
|
||||
/* returns 1 only if the rec contains all the given clusters -- that is that
|
||||
* rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
|
||||
* clusters) is >= the argument's endpoint */
|
||||
static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
|
||||
u32 cpos, u32 clusters)
|
||||
{
|
||||
if (le32_to_cpu(rec->e_cpos) > cpos)
|
||||
return 0;
|
||||
if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
|
||||
le32_to_cpu(rec->e_clusters))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Find an entry in the tree that intersects the region passed in.
|
||||
* Note that this will find straddled intervals, it is up to the
|
||||
* callers to enforce any boundary conditions.
|
||||
*
|
||||
* Callers must hold ip_lock. This lookup is not guaranteed to return
|
||||
* a tree_depth 0 match, and as such can race inserts if the lock
|
||||
* were not held.
|
||||
*
|
||||
* The rb_node garbage lets insertion share the search. Trivial
|
||||
* callers pass NULL.
|
||||
*/
|
||||
static struct ocfs2_extent_map_entry *
|
||||
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
|
||||
u32 cpos, u32 clusters,
|
||||
struct rb_node ***ret_p,
|
||||
struct rb_node **ret_parent)
|
||||
{
|
||||
struct rb_node **p = &em->em_extents.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct ocfs2_extent_map_entry *ent = NULL;
|
||||
|
||||
while (*p)
|
||||
{
|
||||
parent = *p;
|
||||
ent = rb_entry(parent, struct ocfs2_extent_map_entry,
|
||||
e_node);
|
||||
if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
|
||||
p = &(*p)->rb_left;
|
||||
ent = NULL;
|
||||
} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
|
||||
le32_to_cpu(ent->e_rec.e_clusters))) {
|
||||
p = &(*p)->rb_right;
|
||||
ent = NULL;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
|
||||
if (ret_p != NULL)
|
||||
*ret_p = p;
|
||||
if (ret_parent != NULL)
|
||||
*ret_parent = parent;
|
||||
return ent;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the leaf containing the interval we want. While we're on our
|
||||
* way down the tree, fill in every record we see at any depth, because
|
||||
* we might want it later.
|
||||
*
|
||||
* Note that this code is run without ip_lock. That's because it
|
||||
* sleeps while reading. If someone is also filling the extent list at
|
||||
* the same time we are, we might have to restart.
|
||||
*/
|
||||
static int ocfs2_extent_map_find_leaf(struct inode *inode,
|
||||
u32 cpos, u32 clusters,
|
||||
struct ocfs2_extent_list *el)
|
||||
{
|
||||
int i, ret;
|
||||
struct buffer_head *eb_bh = NULL;
|
||||
u64 blkno;
|
||||
u32 rec_end;
|
||||
struct ocfs2_extent_block *eb;
|
||||
struct ocfs2_extent_rec *rec;
|
||||
|
||||
/*
|
||||
* The bh data containing the el cannot change here, because
|
||||
* we hold alloc_sem. So we can do this without other
|
||||
* locks.
|
||||
*/
|
||||
while (el->l_tree_depth)
|
||||
{
|
||||
blkno = 0;
|
||||
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
|
||||
rec = &el->l_recs[i];
|
||||
rec_end = (le32_to_cpu(rec->e_cpos) +
|
||||
le32_to_cpu(rec->e_clusters));
|
||||
|
||||
ret = -EBADR;
|
||||
if (rec_end > OCFS2_I(inode)->ip_clusters) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
if (rec_end <= cpos) {
|
||||
ret = ocfs2_extent_map_insert(inode, rec,
|
||||
le16_to_cpu(el->l_tree_depth));
|
||||
if (ret && (ret != -EEXIST)) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
|
||||
ret = ocfs2_extent_map_insert(inode, rec,
|
||||
le16_to_cpu(el->l_tree_depth));
|
||||
if (ret && (ret != -EEXIST)) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've found a record that matches our
|
||||
* interval. We don't insert it because we're
|
||||
* about to traverse it.
|
||||
*/
|
||||
|
||||
/* Check to see if we're stradling */
|
||||
ret = -ESRCH;
|
||||
if (!ocfs2_extent_rec_contains_clusters(rec,
|
||||
cpos,
|
||||
clusters)) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we've already found a record, the el has
|
||||
* two records covering the same interval.
|
||||
* EEEK!
|
||||
*/
|
||||
ret = -EBADR;
|
||||
if (blkno) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
blkno = le64_to_cpu(rec->e_blkno);
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't support holes, and we're still up
|
||||
* in the branches, so we'd better have found someone
|
||||
*/
|
||||
ret = -EBADR;
|
||||
if (!blkno) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
if (eb_bh) {
|
||||
brelse(eb_bh);
|
||||
eb_bh = NULL;
|
||||
}
|
||||
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
|
||||
blkno, &eb_bh, OCFS2_BH_CACHED,
|
||||
inode);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
eb = (struct ocfs2_extent_block *)eb_bh->b_data;
|
||||
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
|
||||
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
|
||||
ret = -EIO;
|
||||
goto out_free;
|
||||
}
|
||||
el = &eb->h_list;
|
||||
}
|
||||
|
||||
if (el->l_tree_depth)
|
||||
BUG();
|
||||
|
||||
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
|
||||
rec = &el->l_recs[i];
|
||||
ret = ocfs2_extent_map_insert(inode, rec,
|
||||
le16_to_cpu(el->l_tree_depth));
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
goto out_free;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
|
||||
out_free:
|
||||
if (eb_bh)
|
||||
brelse(eb_bh);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This lookup actually will read from disk. It has one invariant:
|
||||
* It will never re-traverse blocks. This means that all inserts should
|
||||
* be new regions or more granular regions (both allowed by insert).
|
||||
*/
|
||||
static int ocfs2_extent_map_lookup_read(struct inode *inode,
|
||||
u32 cpos,
|
||||
u32 clusters,
|
||||
struct ocfs2_extent_map_entry **ret_ent)
|
||||
{
|
||||
int ret;
|
||||
u64 blkno;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_map_entry *ent;
|
||||
struct buffer_head *bh = NULL;
|
||||
struct ocfs2_extent_block *eb;
|
||||
struct ocfs2_dinode *di;
|
||||
struct ocfs2_extent_list *el;
|
||||
|
||||
spin_lock(&OCFS2_I(inode)->ip_lock);
|
||||
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
|
||||
if (ent) {
|
||||
if (!ent->e_tree_depth) {
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
*ret_ent = ent;
|
||||
return 0;
|
||||
}
|
||||
blkno = le64_to_cpu(ent->e_rec.e_blkno);
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
|
||||
OCFS2_BH_CACHED, inode);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
if (bh)
|
||||
brelse(bh);
|
||||
return ret;
|
||||
}
|
||||
eb = (struct ocfs2_extent_block *)bh->b_data;
|
||||
if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
|
||||
OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
|
||||
brelse(bh);
|
||||
return -EIO;
|
||||
}
|
||||
el = &eb->h_list;
|
||||
} else {
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
|
||||
OCFS2_I(inode)->ip_blkno, &bh,
|
||||
OCFS2_BH_CACHED, inode);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
if (bh)
|
||||
brelse(bh);
|
||||
return ret;
|
||||
}
|
||||
di = (struct ocfs2_dinode *)bh->b_data;
|
||||
if (!OCFS2_IS_VALID_DINODE(di)) {
|
||||
brelse(bh);
|
||||
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
|
||||
return -EIO;
|
||||
}
|
||||
el = &di->id2.i_list;
|
||||
}
|
||||
|
||||
ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
|
||||
brelse(bh);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
|
||||
if (!ent) {
|
||||
ret = -ESRCH;
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ent->e_tree_depth)
|
||||
BUG(); /* FIXME: Make sure this isn't a corruption */
|
||||
|
||||
*ret_ent = ent;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Callers must hold ip_lock. This can insert pieces of the tree,
|
||||
* thus racing lookup if the lock weren't held.
|
||||
*/
|
||||
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
|
||||
struct ocfs2_extent_map_entry *ent)
|
||||
{
|
||||
struct rb_node **p, *parent;
|
||||
struct ocfs2_extent_map_entry *old_ent;
|
||||
|
||||
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
|
||||
le32_to_cpu(ent->e_rec.e_clusters),
|
||||
&p, &parent);
|
||||
if (old_ent)
|
||||
return -EEXIST;
|
||||
|
||||
rb_link_node(&ent->e_node, parent, p);
|
||||
rb_insert_color(&ent->e_node, &em->em_extents);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Simple rule: on any return code other than -EAGAIN, anything left
|
||||
* in the insert_context will be freed.
|
||||
*/
|
||||
static int ocfs2_extent_map_try_insert(struct inode *inode,
|
||||
struct ocfs2_extent_rec *rec,
|
||||
int tree_depth,
|
||||
struct ocfs2_em_insert_context *ctxt)
|
||||
{
|
||||
int ret;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_map_entry *old_ent;
|
||||
|
||||
ctxt->need_left = 0;
|
||||
ctxt->need_right = 0;
|
||||
ctxt->old_ent = NULL;
|
||||
|
||||
spin_lock(&OCFS2_I(inode)->ip_lock);
|
||||
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
|
||||
if (!ret) {
|
||||
ctxt->new_ent = NULL;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
|
||||
le32_to_cpu(rec->e_clusters), NULL,
|
||||
NULL);
|
||||
|
||||
if (!old_ent)
|
||||
BUG();
|
||||
|
||||
ret = -EEXIST;
|
||||
if (old_ent->e_tree_depth < tree_depth)
|
||||
goto out_unlock;
|
||||
|
||||
if (old_ent->e_tree_depth == tree_depth) {
|
||||
if (!memcmp(rec, &old_ent->e_rec,
|
||||
sizeof(struct ocfs2_extent_rec)))
|
||||
ret = 0;
|
||||
|
||||
/* FIXME: Should this be ESRCH/EBADR??? */
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* We do it in this order specifically so that no actual tree
|
||||
* changes occur until we have all the pieces we need. We
|
||||
* don't want malloc failures to leave an inconsistent tree.
|
||||
* Whenever we drop the lock, another process could be
|
||||
* inserting. Also note that, if another process just beat us
|
||||
* to an insert, we might not need the same pieces we needed
|
||||
* the first go round. In the end, the pieces we need will
|
||||
* be used, and the pieces we don't will be freed.
|
||||
*/
|
||||
ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
|
||||
le32_to_cpu(old_ent->e_rec.e_cpos));
|
||||
ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
|
||||
le32_to_cpu(old_ent->e_rec.e_clusters)) >
|
||||
(le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
|
||||
ret = -EAGAIN;
|
||||
if (ctxt->need_left) {
|
||||
if (!ctxt->left_ent)
|
||||
goto out_unlock;
|
||||
*(ctxt->left_ent) = *old_ent;
|
||||
ctxt->left_ent->e_rec.e_clusters =
|
||||
cpu_to_le32(le32_to_cpu(rec->e_cpos) -
|
||||
le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
|
||||
}
|
||||
if (ctxt->need_right) {
|
||||
if (!ctxt->right_ent)
|
||||
goto out_unlock;
|
||||
*(ctxt->right_ent) = *old_ent;
|
||||
ctxt->right_ent->e_rec.e_cpos =
|
||||
cpu_to_le32(le32_to_cpu(rec->e_cpos) +
|
||||
le32_to_cpu(rec->e_clusters));
|
||||
ctxt->right_ent->e_rec.e_clusters =
|
||||
cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
|
||||
le32_to_cpu(old_ent->e_rec.e_clusters)) -
|
||||
le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
|
||||
}
|
||||
|
||||
rb_erase(&old_ent->e_node, &em->em_extents);
|
||||
/* Now that he's erased, set him up for deletion */
|
||||
ctxt->old_ent = old_ent;
|
||||
|
||||
if (ctxt->need_left) {
|
||||
ret = ocfs2_extent_map_insert_entry(em,
|
||||
ctxt->left_ent);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
ctxt->left_ent = NULL;
|
||||
}
|
||||
|
||||
if (ctxt->need_right) {
|
||||
ret = ocfs2_extent_map_insert_entry(em,
|
||||
ctxt->right_ent);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
ctxt->right_ent = NULL;
|
||||
}
|
||||
|
||||
ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
|
||||
|
||||
if (!ret)
|
||||
ctxt->new_ent = NULL;
|
||||
|
||||
out_unlock:
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static int ocfs2_extent_map_insert(struct inode *inode,
|
||||
struct ocfs2_extent_rec *rec,
|
||||
int tree_depth)
|
||||
{
|
||||
int ret;
|
||||
struct ocfs2_em_insert_context ctxt = {0, };
|
||||
|
||||
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
|
||||
OCFS2_I(inode)->ip_map.em_clusters) {
|
||||
ret = -EBADR;
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Zero e_clusters means a truncated tail record. It better be EOF */
|
||||
if (!rec->e_clusters) {
|
||||
if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
|
||||
OCFS2_I(inode)->ip_map.em_clusters) {
|
||||
ret = -EBADR;
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Ignore the truncated tail */
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = -ENOMEM;
|
||||
ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
|
||||
GFP_KERNEL);
|
||||
if (!ctxt.new_ent) {
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
ctxt.new_ent->e_rec = *rec;
|
||||
ctxt.new_ent->e_tree_depth = tree_depth;
|
||||
|
||||
do {
|
||||
ret = -ENOMEM;
|
||||
if (ctxt.need_left && !ctxt.left_ent) {
|
||||
ctxt.left_ent =
|
||||
kmem_cache_alloc(ocfs2_em_ent_cachep,
|
||||
GFP_KERNEL);
|
||||
if (!ctxt.left_ent)
|
||||
break;
|
||||
}
|
||||
if (ctxt.need_right && !ctxt.right_ent) {
|
||||
ctxt.right_ent =
|
||||
kmem_cache_alloc(ocfs2_em_ent_cachep,
|
||||
GFP_KERNEL);
|
||||
if (!ctxt.right_ent)
|
||||
break;
|
||||
}
|
||||
|
||||
ret = ocfs2_extent_map_try_insert(inode, rec,
|
||||
tree_depth, &ctxt);
|
||||
} while (ret == -EAGAIN);
|
||||
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
|
||||
if (ctxt.left_ent)
|
||||
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
|
||||
if (ctxt.right_ent)
|
||||
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
|
||||
if (ctxt.old_ent)
|
||||
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
|
||||
if (ctxt.new_ent)
|
||||
kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append this record to the tail of the extent map. It must be
|
||||
* tree_depth 0. The record might be an extension of an existing
|
||||
* record, and as such that needs to be handled. eg:
|
||||
*
|
||||
* Existing record in the extent map:
|
||||
*
|
||||
* cpos = 10, len = 10
|
||||
* |---------|
|
||||
*
|
||||
* New Record:
|
||||
*
|
||||
* cpos = 10, len = 20
|
||||
* |------------------|
|
||||
*
|
||||
* The passed record is the new on-disk record. The new_clusters value
|
||||
* is how many clusters were added to the file. If the append is a
|
||||
* contiguous append, the new_clusters has been added to
|
||||
* rec->e_clusters. If the append is an entirely new extent, then
|
||||
* rec->e_clusters is == new_clusters.
|
||||
*/
|
||||
int ocfs2_extent_map_append(struct inode *inode,
|
||||
struct ocfs2_extent_rec *rec,
|
||||
u32 new_clusters)
|
||||
{
|
||||
int ret;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_map_entry *ent;
|
||||
struct ocfs2_extent_rec *old;
|
||||
|
||||
BUG_ON(!new_clusters);
|
||||
BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
|
||||
|
||||
if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
|
||||
/*
|
||||
* Size changed underneath us on disk. Drop any
|
||||
* straddling records and update our idea of
|
||||
* i_clusters
|
||||
*/
|
||||
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
|
||||
em->em_clusters = OCFS2_I(inode)->ip_clusters;
|
||||
}
|
||||
|
||||
mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
|
||||
le32_to_cpu(rec->e_clusters)) !=
|
||||
(em->em_clusters + new_clusters),
|
||||
"Inode %"MLFu64":\n"
|
||||
"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
|
||||
"em->em_clusters = %u + new_clusters = %u = %u\n",
|
||||
OCFS2_I(inode)->ip_blkno,
|
||||
le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
|
||||
le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
|
||||
em->em_clusters, new_clusters,
|
||||
em->em_clusters + new_clusters);
|
||||
|
||||
em->em_clusters += new_clusters;
|
||||
|
||||
ret = -ENOENT;
|
||||
if (le32_to_cpu(rec->e_clusters) > new_clusters) {
|
||||
/* This is a contiguous append */
|
||||
ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
|
||||
NULL, NULL);
|
||||
if (ent) {
|
||||
old = &ent->e_rec;
|
||||
BUG_ON((le32_to_cpu(rec->e_cpos) +
|
||||
le32_to_cpu(rec->e_clusters)) !=
|
||||
(le32_to_cpu(old->e_cpos) +
|
||||
le32_to_cpu(old->e_clusters) +
|
||||
new_clusters));
|
||||
if (ent->e_tree_depth == 0) {
|
||||
BUG_ON(le32_to_cpu(old->e_cpos) !=
|
||||
le32_to_cpu(rec->e_cpos));
|
||||
BUG_ON(le64_to_cpu(old->e_blkno) !=
|
||||
le64_to_cpu(rec->e_blkno));
|
||||
ret = 0;
|
||||
}
|
||||
/*
|
||||
* Let non-leafs fall through as -ENOENT to
|
||||
* force insertion of the new leaf.
|
||||
*/
|
||||
le32_add_cpu(&old->e_clusters, new_clusters);
|
||||
}
|
||||
}
|
||||
|
||||
if (ret == -ENOENT)
|
||||
ret = ocfs2_extent_map_insert(inode, rec, 0);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* Code here is included but defined out as it completes the extent
|
||||
* map api and may be used in the future. */
|
||||
|
||||
/*
|
||||
* Look up the record containing this cluster offset. This record is
|
||||
* part of the extent map. Do not free it. Any changes you make to
|
||||
* it will reflect in the extent map. So, if your last extent
|
||||
* is (cpos = 10, clusters = 10) and you truncate the file by 5
|
||||
* clusters, you can do:
|
||||
*
|
||||
* ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
|
||||
* rec->e_clusters -= 5;
|
||||
*
|
||||
* The lookup does not read from disk. If the map isn't filled in for
|
||||
* an entry, you won't find it.
|
||||
*
|
||||
* Also note that the returned record is valid until alloc_sem is
|
||||
* dropped. After that, truncate and extend can happen. Caveat Emptor.
|
||||
*/
|
||||
int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
|
||||
struct ocfs2_extent_rec **rec,
|
||||
int *tree_depth)
|
||||
{
|
||||
int ret = -ENOENT;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_map_entry *ent;
|
||||
|
||||
*rec = NULL;
|
||||
|
||||
if (cpos >= OCFS2_I(inode)->ip_clusters)
|
||||
return -EINVAL;
|
||||
|
||||
if (cpos >= em->em_clusters) {
|
||||
/*
|
||||
* Size changed underneath us on disk. Drop any
|
||||
* straddling records and update our idea of
|
||||
* i_clusters
|
||||
*/
|
||||
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
|
||||
em->em_clusters = OCFS2_I(inode)->ip_clusters ;
|
||||
}
|
||||
|
||||
ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
|
||||
NULL, NULL);
|
||||
|
||||
if (ent) {
|
||||
*rec = &ent->e_rec;
|
||||
if (tree_depth)
|
||||
*tree_depth = ent->e_tree_depth;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ocfs2_extent_map_get_clusters(struct inode *inode,
|
||||
u32 v_cpos, int count,
|
||||
u32 *p_cpos, int *ret_count)
|
||||
{
|
||||
int ret;
|
||||
u32 coff, ccount;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_map_entry *ent = NULL;
|
||||
|
||||
*p_cpos = ccount = 0;
|
||||
|
||||
if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
|
||||
return -EINVAL;
|
||||
|
||||
if ((v_cpos + count) > em->em_clusters) {
|
||||
/*
|
||||
* Size changed underneath us on disk. Drop any
|
||||
* straddling records and update our idea of
|
||||
* i_clusters
|
||||
*/
|
||||
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
|
||||
em->em_clusters = OCFS2_I(inode)->ip_clusters;
|
||||
}
|
||||
|
||||
|
||||
ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (ent) {
|
||||
/* We should never find ourselves straddling an interval */
|
||||
if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
|
||||
v_cpos,
|
||||
count))
|
||||
return -ESRCH;
|
||||
|
||||
coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
|
||||
*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
|
||||
le64_to_cpu(ent->e_rec.e_blkno)) +
|
||||
coff;
|
||||
|
||||
if (ret_count)
|
||||
*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
#endif /* 0 */
|
||||
|
||||
int ocfs2_extent_map_get_blocks(struct inode *inode,
|
||||
u64 v_blkno, int count,
|
||||
u64 *p_blkno, int *ret_count)
|
||||
{
|
||||
int ret;
|
||||
u64 boff;
|
||||
u32 cpos, clusters;
|
||||
int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
|
||||
struct ocfs2_extent_map_entry *ent = NULL;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_rec *rec;
|
||||
|
||||
*p_blkno = 0;
|
||||
|
||||
cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
|
||||
clusters = ocfs2_blocks_to_clusters(inode->i_sb,
|
||||
(u64)count + bpc - 1);
|
||||
if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
|
||||
ret = -EINVAL;
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if ((cpos + clusters) > em->em_clusters) {
|
||||
/*
|
||||
* Size changed underneath us on disk. Drop any
|
||||
* straddling records and update our idea of
|
||||
* i_clusters
|
||||
*/
|
||||
ocfs2_extent_map_drop(inode, em->em_clusters - 1);
|
||||
em->em_clusters = OCFS2_I(inode)->ip_clusters;
|
||||
}
|
||||
|
||||
ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
|
||||
if (ret) {
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ent)
|
||||
{
|
||||
rec = &ent->e_rec;
|
||||
|
||||
/* We should never find ourselves straddling an interval */
|
||||
if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
|
||||
ret = -ESRCH;
|
||||
mlog_errno(ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
|
||||
le32_to_cpu(rec->e_cpos));
|
||||
boff += (v_blkno & (u64)(bpc - 1));
|
||||
*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
|
||||
|
||||
if (ret_count) {
|
||||
*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
|
||||
le32_to_cpu(rec->e_clusters)) - boff;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
int ocfs2_extent_map_init(struct inode *inode)
|
||||
{
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
|
||||
em->em_extents = RB_ROOT;
|
||||
em->em_clusters = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Needs the lock */
|
||||
static void __ocfs2_extent_map_drop(struct inode *inode,
|
||||
u32 new_clusters,
|
||||
struct rb_node **free_head,
|
||||
struct ocfs2_extent_map_entry **tail_ent)
|
||||
{
|
||||
struct rb_node *node, *next;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_map_entry *ent;
|
||||
|
||||
*free_head = NULL;
|
||||
|
||||
ent = NULL;
|
||||
node = rb_last(&em->em_extents);
|
||||
while (node)
|
||||
{
|
||||
next = rb_prev(node);
|
||||
|
||||
ent = rb_entry(node, struct ocfs2_extent_map_entry,
|
||||
e_node);
|
||||
if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
|
||||
break;
|
||||
|
||||
rb_erase(&ent->e_node, &em->em_extents);
|
||||
|
||||
node->rb_right = *free_head;
|
||||
*free_head = node;
|
||||
|
||||
ent = NULL;
|
||||
node = next;
|
||||
}
|
||||
|
||||
/* Do we have an entry straddling new_clusters? */
|
||||
if (tail_ent) {
|
||||
if (ent &&
|
||||
((le32_to_cpu(ent->e_rec.e_cpos) +
|
||||
le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
|
||||
*tail_ent = ent;
|
||||
else
|
||||
*tail_ent = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
|
||||
{
|
||||
struct rb_node *node;
|
||||
struct ocfs2_extent_map_entry *ent;
|
||||
|
||||
while (free_head) {
|
||||
node = free_head;
|
||||
free_head = node->rb_right;
|
||||
|
||||
ent = rb_entry(node, struct ocfs2_extent_map_entry,
|
||||
e_node);
|
||||
kmem_cache_free(ocfs2_em_ent_cachep, ent);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove all entries past new_clusters, inclusive of an entry that
|
||||
* contains new_clusters. This is effectively a cache forget.
|
||||
*
|
||||
* If you want to also clip the last extent by some number of clusters,
|
||||
* you need to call ocfs2_extent_map_trunc().
|
||||
* This code does not check or modify ip_clusters.
|
||||
*/
|
||||
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
|
||||
{
|
||||
struct rb_node *free_head = NULL;
|
||||
struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
|
||||
struct ocfs2_extent_map_entry *ent;
|
||||
|
||||
spin_lock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
|
||||
|
||||
if (ent) {
|
||||
rb_erase(&ent->e_node, &em->em_extents);
|
||||
ent->e_node.rb_right = free_head;
|
||||
free_head = &ent->e_node;
|
||||
}
|
||||
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
if (free_head)
|
||||
__ocfs2_extent_map_drop_cleanup(free_head);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove all entries past new_clusters and also clip any extent
|
||||
* straddling new_clusters, if there is one. This does not check
|
||||
* or modify ip_clusters
|
||||
*/
|
||||
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
|
||||
{
|
||||
struct rb_node *free_head = NULL;
|
||||
struct ocfs2_extent_map_entry *ent = NULL;
|
||||
|
||||
spin_lock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
|
||||
|
||||
if (ent)
|
||||
ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
|
||||
le32_to_cpu(ent->e_rec.e_cpos));
|
||||
|
||||
OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
|
||||
|
||||
spin_unlock(&OCFS2_I(inode)->ip_lock);
|
||||
|
||||
if (free_head)
|
||||
__ocfs2_extent_map_drop_cleanup(free_head);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __init init_ocfs2_extent_maps(void)
|
||||
{
|
||||
ocfs2_em_ent_cachep =
|
||||
kmem_cache_create("ocfs2_em_ent",
|
||||
sizeof(struct ocfs2_extent_map_entry),
|
||||
0, SLAB_HWCACHE_ALIGN, NULL, NULL);
|
||||
if (!ocfs2_em_ent_cachep)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __exit exit_ocfs2_extent_maps(void)
|
||||
{
|
||||
kmem_cache_destroy(ocfs2_em_ent_cachep);
|
||||
}
|
46
fs/ocfs2/extent_map.h
Normal file
46
fs/ocfs2/extent_map.h
Normal file
@ -0,0 +1,46 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* extent_map.h
|
||||
*
|
||||
* In-memory file extent mappings for OCFS2.
|
||||
*
|
||||
* Copyright (C) 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License, version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _EXTENT_MAP_H
|
||||
#define _EXTENT_MAP_H
|
||||
|
||||
int init_ocfs2_extent_maps(void);
|
||||
void exit_ocfs2_extent_maps(void);
|
||||
|
||||
/*
|
||||
* EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
|
||||
* to be held. The allocation cannot change at all while the map is
|
||||
* in the process of being updated.
|
||||
*/
|
||||
int ocfs2_extent_map_init(struct inode *inode);
|
||||
int ocfs2_extent_map_append(struct inode *inode,
|
||||
struct ocfs2_extent_rec *rec,
|
||||
u32 new_clusters);
|
||||
int ocfs2_extent_map_get_blocks(struct inode *inode,
|
||||
u64 v_blkno, int count,
|
||||
u64 *p_blkno, int *ret_count);
|
||||
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
|
||||
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
|
||||
|
||||
#endif /* _EXTENT_MAP_H */
|
1237
fs/ocfs2/file.c
Normal file
1237
fs/ocfs2/file.c
Normal file
File diff suppressed because it is too large
Load Diff
57
fs/ocfs2/file.h
Normal file
57
fs/ocfs2/file.h
Normal file
@ -0,0 +1,57 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* file.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_FILE_H
|
||||
#define OCFS2_FILE_H
|
||||
|
||||
extern struct file_operations ocfs2_fops;
|
||||
extern struct file_operations ocfs2_dops;
|
||||
extern struct inode_operations ocfs2_file_iops;
|
||||
extern struct inode_operations ocfs2_special_file_iops;
|
||||
struct ocfs2_alloc_context;
|
||||
|
||||
/*
 * Value reported through the 'reason' out-parameter of
 * ocfs2_do_extend_allocation().
 * NOTE(review): the per-value meanings below are inferred from the
 * names only -- confirm against file.c.
 */
enum ocfs2_alloc_restarted {
	RESTART_NONE	= 0,	/* presumably: no restart needed */
	RESTART_TRANS	= 1,	/* presumably: restart the transaction */
	RESTART_META	= 2	/* presumably: restart for more metadata */
};
|
||||
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
|
||||
struct inode *inode,
|
||||
u32 clusters_to_add,
|
||||
struct buffer_head *fe_bh,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *data_ac,
|
||||
struct ocfs2_alloc_context *meta_ac,
|
||||
enum ocfs2_alloc_restarted *reason);
|
||||
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat);
|
||||
|
||||
int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
|
||||
struct inode *inode,
|
||||
struct buffer_head *fe_bh,
|
||||
u64 new_i_size);
|
||||
|
||||
#endif /* OCFS2_FILE_H */
|
378
fs/ocfs2/heartbeat.c
Normal file
378
fs/ocfs2/heartbeat.c
Normal file
@ -0,0 +1,378 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* heartbeat.c
|
||||
*
|
||||
* Register ourselves with the heartbeat service, keep our node maps
|
||||
* up to date, and fire off recovery when needed.
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/kmod.h>
|
||||
|
||||
#include <cluster/heartbeat.h>
|
||||
#include <cluster/nodemanager.h>
|
||||
|
||||
#include <dlm/dlmapi.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_SUPER
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "heartbeat.h"
|
||||
#include "inode.h"
|
||||
#include "journal.h"
|
||||
#include "vote.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
#define OCFS2_HB_NODE_DOWN_PRI (0x0000002)
|
||||
#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI
|
||||
|
||||
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
|
||||
int bit);
|
||||
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
|
||||
int bit);
|
||||
static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
|
||||
static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
|
||||
struct ocfs2_node_map *from);
|
||||
static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
|
||||
struct ocfs2_node_map *from);
|
||||
|
||||
void ocfs2_init_node_maps(struct ocfs2_super *osb)
|
||||
{
|
||||
spin_lock_init(&osb->node_map_lock);
|
||||
ocfs2_node_map_init(&osb->mounted_map);
|
||||
ocfs2_node_map_init(&osb->recovery_map);
|
||||
ocfs2_node_map_init(&osb->umount_map);
|
||||
}
|
||||
|
||||
static void ocfs2_do_node_down(int node_num,
|
||||
struct ocfs2_super *osb)
|
||||
{
|
||||
BUG_ON(osb->node_num == node_num);
|
||||
|
||||
mlog(0, "ocfs2: node down event for %d\n", node_num);
|
||||
|
||||
if (!osb->dlm) {
|
||||
/*
|
||||
* No DLM means we're not even ready to participate yet.
|
||||
* We check the slots after the DLM comes up, so we will
|
||||
* notice the node death then. We can safely ignore it
|
||||
* here.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) {
|
||||
/* If a node is in the umount map, then we've been
|
||||
* expecting him to go down and we know ahead of time
|
||||
* that recovery is not necessary. */
|
||||
ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
|
||||
return;
|
||||
}
|
||||
|
||||
ocfs2_recovery_thread(osb, node_num);
|
||||
|
||||
ocfs2_remove_node_from_vote_queues(osb, node_num);
|
||||
}
|
||||
|
||||
/*
 * Heartbeat node-down callback.  @data is the ocfs2_super that was
 * supplied when the callback was set up; @node is not used.
 */
static void ocfs2_hb_node_down_cb(struct o2nm_node *node,
				  int node_num,
				  void *data)
{
	struct ocfs2_super *osb = data;

	ocfs2_do_node_down(node_num, osb);
}
|
||||
|
||||
/* Called from the dlm when it's about to evict a node. We may also
|
||||
* get a heartbeat callback later. */
|
||||
static void ocfs2_dlm_eviction_cb(int node_num,
|
||||
void *data)
|
||||
{
|
||||
struct ocfs2_super *osb = (struct ocfs2_super *) data;
|
||||
struct super_block *sb = osb->sb;
|
||||
|
||||
mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
|
||||
MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
|
||||
|
||||
ocfs2_do_node_down(node_num, osb);
|
||||
}
|
||||
|
||||
static void ocfs2_hb_node_up_cb(struct o2nm_node *node,
|
||||
int node_num,
|
||||
void *data)
|
||||
{
|
||||
struct ocfs2_super *osb = data;
|
||||
|
||||
BUG_ON(osb->node_num == node_num);
|
||||
|
||||
mlog(0, "node up event for %d\n", node_num);
|
||||
ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num);
|
||||
}
|
||||
|
||||
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
|
||||
{
|
||||
o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB,
|
||||
ocfs2_hb_node_down_cb, osb,
|
||||
OCFS2_HB_NODE_DOWN_PRI);
|
||||
|
||||
o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB,
|
||||
ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI);
|
||||
|
||||
/* Not exactly a heartbeat callback, but leads to essentially
|
||||
* the same path so we set it up here. */
|
||||
dlm_setup_eviction_cb(&osb->osb_eviction_cb,
|
||||
ocfs2_dlm_eviction_cb,
|
||||
osb);
|
||||
}
|
||||
|
||||
/* Most functions here are just stubs for now... */
|
||||
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
|
||||
{
|
||||
int status;
|
||||
|
||||
status = o2hb_register_callback(&osb->osb_hb_down);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = o2hb_register_callback(&osb->osb_hb_up);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
bail:
|
||||
return status;
|
||||
}
|
||||
|
||||
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
|
||||
{
|
||||
int status;
|
||||
|
||||
status = o2hb_unregister_callback(&osb->osb_hb_down);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
status = o2hb_unregister_callback(&osb->osb_hb_up);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
}
|
||||
|
||||
void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
|
||||
{
|
||||
int ret;
|
||||
char *argv[5], *envp[3];
|
||||
|
||||
if (!osb->uuid_str) {
|
||||
/* This can happen if we don't get far enough in mount... */
|
||||
mlog(0, "No UUID with which to stop heartbeat!\n\n");
|
||||
return;
|
||||
}
|
||||
|
||||
argv[0] = (char *)o2nm_get_hb_ctl_path();
|
||||
argv[1] = "-K";
|
||||
argv[2] = "-u";
|
||||
argv[3] = osb->uuid_str;
|
||||
argv[4] = NULL;
|
||||
|
||||
mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
|
||||
|
||||
/* minimal command environment taken from cpu_run_sbin_hotplug */
|
||||
envp[0] = "HOME=/";
|
||||
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
|
||||
envp[2] = NULL;
|
||||
|
||||
ret = call_usermodehelper(argv[0], argv, envp, 1);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
}
|
||||
|
||||
/* special case -1 for now
|
||||
* TODO: should *really* make sure the calling func never passes -1!! */
|
||||
void ocfs2_node_map_init(struct ocfs2_node_map *map)
|
||||
{
|
||||
map->num_nodes = OCFS2_NODE_MAP_MAX_NODES;
|
||||
memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) *
|
||||
sizeof(unsigned long));
|
||||
}
|
||||
|
||||
static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
|
||||
int bit)
|
||||
{
|
||||
set_bit(bit, map->map);
|
||||
}
|
||||
|
||||
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int bit)
|
||||
{
|
||||
if (bit==-1)
|
||||
return;
|
||||
BUG_ON(bit >= map->num_nodes);
|
||||
spin_lock(&osb->node_map_lock);
|
||||
__ocfs2_node_map_set_bit(map, bit);
|
||||
spin_unlock(&osb->node_map_lock);
|
||||
}
|
||||
|
||||
static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
|
||||
int bit)
|
||||
{
|
||||
clear_bit(bit, map->map);
|
||||
}
|
||||
|
||||
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int bit)
|
||||
{
|
||||
if (bit==-1)
|
||||
return;
|
||||
BUG_ON(bit >= map->num_nodes);
|
||||
spin_lock(&osb->node_map_lock);
|
||||
__ocfs2_node_map_clear_bit(map, bit);
|
||||
spin_unlock(&osb->node_map_lock);
|
||||
}
|
||||
|
||||
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int bit)
|
||||
{
|
||||
int ret;
|
||||
if (bit >= map->num_nodes) {
|
||||
mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes);
|
||||
BUG();
|
||||
}
|
||||
spin_lock(&osb->node_map_lock);
|
||||
ret = test_bit(bit, map->map);
|
||||
spin_unlock(&osb->node_map_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
|
||||
{
|
||||
int bit;
|
||||
bit = find_next_bit(map->map, map->num_nodes, 0);
|
||||
if (bit < map->num_nodes)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map)
|
||||
{
|
||||
int ret;
|
||||
BUG_ON(map->num_nodes == 0);
|
||||
spin_lock(&osb->node_map_lock);
|
||||
ret = __ocfs2_node_map_is_empty(map);
|
||||
spin_unlock(&osb->node_map_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
|
||||
struct ocfs2_node_map *from)
|
||||
{
|
||||
BUG_ON(from->num_nodes == 0);
|
||||
ocfs2_node_map_init(target);
|
||||
__ocfs2_node_map_set(target, from);
|
||||
}
|
||||
|
||||
/* returns 1 if bit is the only bit set in target, 0 otherwise */
|
||||
int ocfs2_node_map_is_only(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *target,
|
||||
int bit)
|
||||
{
|
||||
struct ocfs2_node_map temp;
|
||||
int ret;
|
||||
|
||||
spin_lock(&osb->node_map_lock);
|
||||
__ocfs2_node_map_dup(&temp, target);
|
||||
__ocfs2_node_map_clear_bit(&temp, bit);
|
||||
ret = __ocfs2_node_map_is_empty(&temp);
|
||||
spin_unlock(&osb->node_map_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
|
||||
struct ocfs2_node_map *from)
|
||||
{
|
||||
int num_longs, i;
|
||||
|
||||
BUG_ON(target->num_nodes != from->num_nodes);
|
||||
BUG_ON(target->num_nodes == 0);
|
||||
|
||||
num_longs = BITS_TO_LONGS(target->num_nodes);
|
||||
for (i = 0; i < num_longs; i++)
|
||||
target->map[i] = from->map[i];
|
||||
}
|
||||
|
||||
/* Returns whether the recovery bit was actually set - it may not be
|
||||
* if a node is still marked as needing recovery */
|
||||
int ocfs2_recovery_map_set(struct ocfs2_super *osb,
|
||||
int num)
|
||||
{
|
||||
int set = 0;
|
||||
|
||||
spin_lock(&osb->node_map_lock);
|
||||
|
||||
__ocfs2_node_map_clear_bit(&osb->mounted_map, num);
|
||||
|
||||
if (!test_bit(num, osb->recovery_map.map)) {
|
||||
__ocfs2_node_map_set_bit(&osb->recovery_map, num);
|
||||
set = 1;
|
||||
}
|
||||
|
||||
spin_unlock(&osb->node_map_lock);
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
|
||||
int num)
|
||||
{
|
||||
ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
|
||||
}
|
||||
|
||||
int ocfs2_node_map_iterate(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int idx)
|
||||
{
|
||||
int i = idx;
|
||||
|
||||
idx = O2NM_INVALID_NODE_NUM;
|
||||
spin_lock(&osb->node_map_lock);
|
||||
if ((i != O2NM_INVALID_NODE_NUM) &&
|
||||
(i >= 0) &&
|
||||
(i < map->num_nodes)) {
|
||||
while(i < map->num_nodes) {
|
||||
if (test_bit(i, map->map)) {
|
||||
idx = i;
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
spin_unlock(&osb->node_map_lock);
|
||||
return idx;
|
||||
}
|
67
fs/ocfs2/heartbeat.h
Normal file
67
fs/ocfs2/heartbeat.h
Normal file
@ -0,0 +1,67 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* heartbeat.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_HEARTBEAT_H
|
||||
#define OCFS2_HEARTBEAT_H
|
||||
|
||||
void ocfs2_init_node_maps(struct ocfs2_super *osb);
|
||||
|
||||
void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb);
|
||||
int ocfs2_register_hb_callbacks(struct ocfs2_super *osb);
|
||||
void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb);
|
||||
void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
|
||||
|
||||
/* node map functions - used to keep track of mounted and in-recovery
|
||||
* nodes. */
|
||||
void ocfs2_node_map_init(struct ocfs2_node_map *map);
|
||||
int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map);
|
||||
void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int bit);
|
||||
void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int bit);
|
||||
int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int bit);
|
||||
int ocfs2_node_map_iterate(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *map,
|
||||
int idx);
|
||||
/*
 * Index of the lowest set bit in @map: the result of
 * ocfs2_node_map_iterate() started at bit 0.
 */
static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
					       struct ocfs2_node_map *map)
{
	const int start = 0;

	return ocfs2_node_map_iterate(osb, map, start);
}
|
||||
int ocfs2_recovery_map_set(struct ocfs2_super *osb,
|
||||
int num);
|
||||
void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
|
||||
int num);
|
||||
/* returns 1 if bit is the only bit set in target, 0 otherwise */
|
||||
int ocfs2_node_map_is_only(struct ocfs2_super *osb,
|
||||
struct ocfs2_node_map *target,
|
||||
int bit);
|
||||
|
||||
#endif /* OCFS2_HEARTBEAT_H */
|
1140
fs/ocfs2/inode.c
Normal file
1140
fs/ocfs2/inode.c
Normal file
File diff suppressed because it is too large
Load Diff
145
fs/ocfs2/inode.h
Normal file
145
fs/ocfs2/inode.h
Normal file
@ -0,0 +1,145 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* inode.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_INODE_H
|
||||
#define OCFS2_INODE_H
|
||||
|
||||
/* OCFS2 Inode Private Data */
|
||||
struct ocfs2_inode_info
|
||||
{
|
||||
u64 ip_blkno;
|
||||
|
||||
struct ocfs2_lock_res ip_rw_lockres;
|
||||
struct ocfs2_lock_res ip_meta_lockres;
|
||||
struct ocfs2_lock_res ip_data_lockres;
|
||||
|
||||
/* protects allocation changes on this inode. */
|
||||
struct rw_semaphore ip_alloc_sem;
|
||||
|
||||
/* These fields are protected by ip_lock */
|
||||
spinlock_t ip_lock;
|
||||
u32 ip_open_count;
|
||||
u32 ip_clusters;
|
||||
struct ocfs2_extent_map ip_map;
|
||||
struct list_head ip_io_markers;
|
||||
int ip_orphaned_slot;
|
||||
|
||||
struct semaphore ip_io_sem;
|
||||
|
||||
/* Used by the journalling code to attach an inode to a
|
||||
* handle. These are protected by ip_io_sem in order to lock
|
||||
* out other I/O to the inode until we either commit or
|
||||
* abort. */
|
||||
struct list_head ip_handle_list;
|
||||
struct ocfs2_journal_handle *ip_handle;
|
||||
|
||||
u32 ip_flags; /* see below */
|
||||
|
||||
/* protected by recovery_lock. */
|
||||
struct inode *ip_next_orphan;
|
||||
|
||||
u32 ip_dir_start_lookup;
|
||||
|
||||
/* next two are protected by trans_inc_lock */
|
||||
/* which transaction were we created on? Zero if none. */
|
||||
unsigned long ip_created_trans;
|
||||
/* last transaction we were a part of. */
|
||||
unsigned long ip_last_trans;
|
||||
|
||||
struct ocfs2_caching_info ip_metadata_cache;
|
||||
|
||||
struct inode vfs_inode;
|
||||
};
|
||||
|
||||
/*
|
||||
* Flags for the ip_flags field
|
||||
*/
|
||||
/* System file inodes */
|
||||
#define OCFS2_INODE_SYSTEM_FILE 0x00000001
|
||||
#define OCFS2_INODE_JOURNAL 0x00000002
|
||||
#define OCFS2_INODE_BITMAP 0x00000004
|
||||
/* This inode has been wiped from disk */
|
||||
#define OCFS2_INODE_DELETED 0x00000008
|
||||
/* Another node is deleting, so our delete is a nop */
|
||||
#define OCFS2_INODE_SKIP_DELETE 0x00000010
|
||||
/* Has the inode been orphaned on another node?
|
||||
*
|
||||
* This hints to ocfs2_drop_inode that it should clear i_nlink before
|
||||
* continuing.
|
||||
*
|
||||
* We *only* set this on unlink vote from another node. If the inode
|
||||
* was locally orphaned, then we're sure of the state and don't need
|
||||
* to twiddle i_nlink later - it's either zero or not depending on
|
||||
* whether our unlink succeeded. Otherwise we got this from a node
|
||||
* whose intention was to orphan the inode, however he may have
|
||||
* crashed, failed etc, so we let ocfs2_drop_inode zero the value and
|
||||
* rely on ocfs2_delete_inode to sort things out under the proper
|
||||
* cluster locks.
|
||||
*/
|
||||
#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
|
||||
/* Does someone have the file open O_DIRECT */
|
||||
#define OCFS2_INODE_OPEN_DIRECT 0x00000040
|
||||
/* Indicates that the metadata cache should be used as an array. */
|
||||
#define OCFS2_INODE_CACHE_INLINE 0x00000080
|
||||
|
||||
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
|
||||
{
|
||||
return container_of(inode, struct ocfs2_inode_info, vfs_inode);
|
||||
}
|
||||
|
||||
#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL)
|
||||
#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL)
|
||||
|
||||
extern kmem_cache_t *ocfs2_inode_cache;
|
||||
|
||||
extern struct address_space_operations ocfs2_aops;
|
||||
|
||||
struct buffer_head *ocfs2_bread(struct inode *inode, int block,
|
||||
int *err, int reada);
|
||||
void ocfs2_clear_inode(struct inode *inode);
|
||||
void ocfs2_delete_inode(struct inode *inode);
|
||||
void ocfs2_drop_inode(struct inode *inode);
|
||||
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff);
|
||||
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
|
||||
u64 blkno,
|
||||
int delete_vote);
|
||||
int ocfs2_inode_init_private(struct inode *inode);
|
||||
int ocfs2_inode_revalidate(struct dentry *dentry);
|
||||
int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
|
||||
int create_ino);
|
||||
void ocfs2_read_inode(struct inode *inode);
|
||||
void ocfs2_read_inode2(struct inode *inode, void *opaque);
|
||||
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
|
||||
size_t size, loff_t *offp);
|
||||
void ocfs2_sync_blockdev(struct super_block *sb);
|
||||
void ocfs2_refresh_inode(struct inode *inode,
|
||||
struct ocfs2_dinode *fe);
|
||||
int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
|
||||
struct inode *inode,
|
||||
struct buffer_head *bh);
|
||||
int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
|
||||
int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
|
||||
|
||||
#endif /* OCFS2_INODE_H */
|
1652
fs/ocfs2/journal.c
Normal file
1652
fs/ocfs2/journal.c
Normal file
File diff suppressed because it is too large
Load Diff
457
fs/ocfs2/journal.h
Normal file
457
fs/ocfs2/journal.h
Normal file
@ -0,0 +1,457 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* journal.h
|
||||
*
|
||||
* Defines journalling api and structures.
|
||||
*
|
||||
* Copyright (C) 2003, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_JOURNAL_H
|
||||
#define OCFS2_JOURNAL_H
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/jbd.h>
|
||||
|
||||
#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ)
|
||||
|
||||
/* Values held in ocfs2_journal.j_state. */
enum ocfs2_journal_state {
	OCFS2_JOURNAL_FREE		= 0,
	OCFS2_JOURNAL_LOADED		= 1,
	OCFS2_JOURNAL_IN_SHUTDOWN	= 2,
};
|
||||
|
||||
struct ocfs2_super;
|
||||
struct ocfs2_dinode;
|
||||
struct ocfs2_journal_handle;
|
||||
|
||||
struct ocfs2_journal {
|
||||
enum ocfs2_journal_state j_state; /* Journals current state */
|
||||
|
||||
journal_t *j_journal; /* The kernels journal type */
|
||||
struct inode *j_inode; /* Kernel inode pointing to
|
||||
* this journal */
|
||||
struct ocfs2_super *j_osb; /* pointer to the super
|
||||
* block for the node
|
||||
* we're currently
|
||||
* running on -- not
|
||||
* necessarily the super
|
||||
* block from the node
|
||||
* which we usually run
|
||||
* from (recovery,
|
||||
* etc) */
|
||||
struct buffer_head *j_bh; /* Journal disk inode block */
|
||||
atomic_t j_num_trans; /* Number of transactions
|
||||
* currently in the system. */
|
||||
unsigned long j_trans_id;
|
||||
struct rw_semaphore j_trans_barrier;
|
||||
wait_queue_head_t j_checkpointed;
|
||||
|
||||
spinlock_t j_lock;
|
||||
struct list_head j_la_cleanups;
|
||||
struct work_struct j_recovery_work;
|
||||
};
|
||||
|
||||
extern spinlock_t trans_inc_lock;
|
||||
|
||||
/* wrap j_trans_id so we never have it equal to zero. */
|
||||
static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
|
||||
{
|
||||
unsigned long old_id;
|
||||
spin_lock(&trans_inc_lock);
|
||||
old_id = j->j_trans_id++;
|
||||
if (unlikely(!j->j_trans_id))
|
||||
j->j_trans_id = 1;
|
||||
spin_unlock(&trans_inc_lock);
|
||||
return old_id;
|
||||
}
|
||||
|
||||
static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
|
||||
struct inode *inode)
|
||||
{
|
||||
spin_lock(&trans_inc_lock);
|
||||
OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
|
||||
spin_unlock(&trans_inc_lock);
|
||||
}
|
||||
|
||||
/* Used to figure out whether it's safe to drop a metadata lock on an
|
||||
* inode. Returns true if all the inodes changes have been
|
||||
* checkpointed to disk. You should be holding the spinlock on the
|
||||
* metadata lock while calling this to be sure that nobody can take
|
||||
* the lock and put it on another transaction. */
|
||||
static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
|
||||
{
|
||||
int ret;
|
||||
struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
|
||||
|
||||
spin_lock(&trans_inc_lock);
|
||||
ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
|
||||
spin_unlock(&trans_inc_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* convenience function to check if an inode is still new (has never
|
||||
* hit disk) Will do you a favor and set created_trans = 0 when you've
|
||||
* been checkpointed. returns '1' if the inode is still new. */
|
||||
static inline int ocfs2_inode_is_new(struct inode *inode)
|
||||
{
|
||||
int ret;
|
||||
|
||||
/* System files are never "new" as they're written out by
|
||||
* mkfs. This helps us early during mount, before we have the
|
||||
* journal open and j_trans_id could be junk. */
|
||||
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
|
||||
return 0;
|
||||
spin_lock(&trans_inc_lock);
|
||||
ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
|
||||
OCFS2_I(inode)->ip_created_trans));
|
||||
if (!ret)
|
||||
OCFS2_I(inode)->ip_created_trans = 0;
|
||||
spin_unlock(&trans_inc_lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
|
||||
struct inode *inode)
|
||||
{
|
||||
spin_lock(&trans_inc_lock);
|
||||
OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
|
||||
spin_unlock(&trans_inc_lock);
|
||||
}
|
||||
|
||||
extern kmem_cache_t *ocfs2_lock_cache;
|
||||
|
||||
struct ocfs2_journal_lock {
|
||||
struct inode *jl_inode;
|
||||
struct list_head jl_lock_list;
|
||||
};
|
||||
|
||||
struct ocfs2_journal_handle {
|
||||
handle_t *k_handle; /* kernel handle. */
|
||||
struct ocfs2_journal *journal;
|
||||
u32 flags; /* see flags below. */
|
||||
int max_buffs; /* Buffs reserved by this handle */
|
||||
|
||||
/* The following two fields are for ocfs2_handle_add_lock */
|
||||
int num_locks;
|
||||
struct list_head locks; /* A bunch of locks to
|
||||
* release on commit. This
|
||||
* should be a list_head */
|
||||
|
||||
struct list_head inode_list;
|
||||
};
|
||||
|
||||
#define OCFS2_HANDLE_STARTED 1
|
||||
/* should we sync-commit this handle? */
|
||||
#define OCFS2_HANDLE_SYNC 2
|
||||
static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
|
||||
{
|
||||
return handle->flags & OCFS2_HANDLE_STARTED;
|
||||
}
|
||||
|
||||
static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
|
||||
{
|
||||
if (sync)
|
||||
handle->flags |= OCFS2_HANDLE_SYNC;
|
||||
else
|
||||
handle->flags &= ~OCFS2_HANDLE_SYNC;
|
||||
}
|
||||
|
||||
/* Exported only for the journal struct init code in super.c. Do not call. */
|
||||
void ocfs2_complete_recovery(void *data);
|
||||
|
||||
/*
|
||||
* Journal Control:
|
||||
* Initialize, Load, Shutdown, Wipe a journal.
|
||||
*
|
||||
* ocfs2_journal_init - Initialize journal structures in the OSB.
|
||||
* ocfs2_journal_load - Load the given journal off disk. Replay it if
|
||||
* there's transactions still in there.
|
||||
* ocfs2_journal_shutdown - Shutdown a journal, this will flush all
|
||||
* uncommitted, uncheckpointed transactions.
|
||||
* ocfs2_journal_wipe - Wipe transactions from a journal. Optionally
|
||||
* zero out each block.
|
||||
* ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb.
|
||||
* ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
|
||||
* event on.
|
||||
* ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
|
||||
*/
|
||||
void ocfs2_set_journal_params(struct ocfs2_super *osb);
|
||||
int ocfs2_journal_init(struct ocfs2_journal *journal,
|
||||
int *dirty);
|
||||
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
|
||||
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
|
||||
int full);
|
||||
int ocfs2_journal_load(struct ocfs2_journal *journal);
|
||||
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
|
||||
void ocfs2_recovery_thread(struct ocfs2_super *osb,
|
||||
int node_num);
|
||||
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
|
||||
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
|
||||
|
||||
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
|
||||
{
|
||||
atomic_set(&osb->needs_checkpoint, 1);
|
||||
wake_up(&osb->checkpoint_event);
|
||||
}
|
||||
|
||||
static inline void ocfs2_checkpoint_inode(struct inode *inode)
|
||||
{
|
||||
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
||||
|
||||
if (!ocfs2_inode_fully_checkpointed(inode)) {
|
||||
/* WARNING: This only kicks off a single
|
||||
* checkpoint. If someone races you and adds more
|
||||
* metadata to the journal, you won't know, and will
|
||||
* wind up waiting *alot* longer than necessary. Right
|
||||
* now we only use this in clear_inode so that's
|
||||
* OK. */
|
||||
ocfs2_start_checkpoint(osb);
|
||||
|
||||
wait_event(osb->journal->j_checkpointed,
|
||||
ocfs2_inode_fully_checkpointed(inode));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Transaction Handling:
|
||||
* Manage the lifetime of a transaction handle.
|
||||
*
|
||||
* ocfs2_alloc_handle - Only allocate a handle so we can start putting
|
||||
* cluster locks on it. To actually change blocks,
|
||||
* call ocfs2_start_trans with the handle returned
|
||||
* from this function. You may call ocfs2_commit_trans
|
||||
* at any time in the lifetime of a handle.
|
||||
* ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
|
||||
* the number of blocks that will be changed during
|
||||
* this handle.
|
||||
* ocfs2_commit_trans - Complete a handle.
|
||||
* ocfs2_extend_trans - Extend a handle by nblocks credits. This may
|
||||
* commit the handle to disk in the process, but will
|
||||
* not release any locks taken during the transaction.
|
||||
* ocfs2_journal_access - Notify the handle that we want to journal this
|
||||
* buffer. Will have to call ocfs2_journal_dirty once
|
||||
* we've actually dirtied it. Type is one of OCFS2_JOURNAL_ACCESS_CREATE,
* OCFS2_JOURNAL_ACCESS_WRITE or OCFS2_JOURNAL_ACCESS_UNDO.
|
||||
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
|
||||
* ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
|
||||
* the current handle commits.
|
||||
* ocfs2_handle_add_lock - Sometimes we need to delay lock release
|
||||
* until after a transaction has been completed. Use
|
||||
* ocfs2_handle_add_lock to indicate that a lock needs
|
||||
* to be released at the end of that handle. Locks
|
||||
* will be released in the order that they are added.
|
||||
* ocfs2_handle_add_inode - Add a locked inode to a transaction.
|
||||
*/
|
||||
|
||||
/* You must always start_trans with a number of buffs > 0, but it's
|
||||
* perfectly legal to go through an entire transaction without having
|
||||
* dirtied any buffers. */
|
||||
struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
|
||||
struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
int max_buffs);
|
||||
void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
|
||||
int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
|
||||
int nblocks);
|
||||
|
||||
/*
|
||||
* Create access is for when we get a newly created buffer and we're
|
||||
* not gonna read it off disk, but rather fill it ourselves. Right
|
||||
* now, we don't do anything special with this (it turns into a write
|
||||
* request), but this is a good placeholder in case we do...
|
||||
*
|
||||
* Write access is for when we read a block off disk and are going to
|
||||
* modify it. This way the journalling layer knows it may need to make
|
||||
* a copy of that block (if it's part of another, uncommitted
|
||||
* transaction) before we do so.
|
||||
*/
|
||||
#define OCFS2_JOURNAL_ACCESS_CREATE 0
|
||||
#define OCFS2_JOURNAL_ACCESS_WRITE 1
|
||||
#define OCFS2_JOURNAL_ACCESS_UNDO 2
|
||||
|
||||
int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
|
||||
struct inode *inode,
|
||||
struct buffer_head *bh,
|
||||
int type);
|
||||
/*
|
||||
* A word about the journal_access/journal_dirty "dance". It is
|
||||
* entirely legal to journal_access a buffer more than once (as long
|
||||
* as the access type is the same -- I'm not sure what will happen if
|
||||
* access type is different but this should never happen anyway) It is
|
||||
* also legal to journal_dirty a buffer more than once. In fact, you
|
||||
* can even journal_access a buffer after you've done a
|
||||
* journal_access/journal_dirty pair. The only thing you cannot do
|
||||
* however, is journal_dirty a buffer which you haven't yet passed to
|
||||
* journal_access at least once.
|
||||
*
|
||||
* That said, 99% of the time this doesn't matter and this is what the
|
||||
* path looks like:
|
||||
*
|
||||
* <read a bh>
|
||||
* ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
* <modify the bh>
|
||||
* ocfs2_journal_dirty(handle, bh);
|
||||
*/
|
||||
int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
|
||||
struct buffer_head *bh);
|
||||
int ocfs2_journal_dirty_data(handle_t *handle,
|
||||
struct buffer_head *bh);
|
||||
int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
|
||||
struct inode *inode);
|
||||
/*
|
||||
* Use this to protect from other processes reading buffer state while
|
||||
* it's in flight.
|
||||
*/
|
||||
void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
|
||||
struct inode *inode);
|
||||
|
||||
/*
|
||||
* Credit Macros:
|
||||
* Convenience macros to calculate number of credits needed.
|
||||
*
|
||||
* For convenience sake, I have a set of macros here which calculate
|
||||
* the *maximum* number of sectors which will be changed for various
|
||||
* metadata updates.
|
||||
*/
|
||||
|
||||
/* simple file updates like chmod, etc. */
|
||||
#define OCFS2_INODE_UPDATE_CREDITS 1
|
||||
|
||||
/* get one bit out of a suballocator: dinode + group descriptor +
|
||||
* prev. group desc. if we relink. */
|
||||
#define OCFS2_SUBALLOC_ALLOC (3)
|
||||
|
||||
/* dinode + group descriptor update. We don't relink on free yet. */
|
||||
#define OCFS2_SUBALLOC_FREE (2)
|
||||
|
||||
#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
|
||||
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
|
||||
+ OCFS2_TRUNCATE_LOG_UPDATE)
|
||||
|
||||
/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
|
||||
* bitmap block for the new bit) */
|
||||
#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
|
||||
|
||||
/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
|
||||
* group descriptor + mkdir/symlink blocks */
|
||||
#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
|
||||
+ OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
|
||||
|
||||
/* local alloc metadata change + main bitmap updates */
|
||||
#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
|
||||
+ OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
|
||||
|
||||
/* used when we don't need an allocation change for a dir extend. One
|
||||
* for the dinode, one for the new block. */
|
||||
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
|
||||
|
||||
/* file update (nlink, etc) + dir entry block */
|
||||
#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
|
||||
|
||||
/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
|
||||
* dir inode link */
|
||||
#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
|
||||
+ OCFS2_LINK_CREDITS)
|
||||
|
||||
/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
|
||||
* inode alloc group descriptor */
|
||||
#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
|
||||
|
||||
/* dinode update, old dir dinode update, new dir dinode update, old
|
||||
* dir dir entry, new dir dir entry, dir entry update for renaming
|
||||
* directory + target unlink */
|
||||
#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
|
||||
+ OCFS2_UNLINK_CREDITS)
|
||||
|
||||
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
|
||||
struct ocfs2_dinode *fe,
|
||||
u32 bits_wanted)
|
||||
{
|
||||
int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
|
||||
|
||||
/* bitmap dinode, group desc. + relinked group. */
|
||||
bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
|
||||
|
||||
/* we might need to shift tree depth so lets assume an
|
||||
* absolute worst case of complete fragmentation. Even with
|
||||
* that, we only need one update for the dinode, and then
|
||||
* however many metadata chunks needed * a remaining suballoc
|
||||
* alloc. */
|
||||
sysfile_bitmap_blocks = 1 +
|
||||
(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
|
||||
|
||||
/* this does not include *new* metadata blocks, which are
|
||||
* accounted for in sysfile_bitmap_blocks. fe +
|
||||
* prev. last_eb_blk + blocks along edge of tree.
|
||||
* calc_symlink_credits passes because we just need 1
|
||||
* credit for the dinode there. */
|
||||
dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
|
||||
|
||||
return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
|
||||
}
|
||||
|
||||
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
|
||||
{
|
||||
int blocks = OCFS2_MKNOD_CREDITS;
|
||||
|
||||
/* links can be longer than one block so we may update many
|
||||
* within our single allocated extent. */
|
||||
blocks += ocfs2_clusters_to_blocks(sb, 1);
|
||||
|
||||
return blocks;
|
||||
}
|
||||
|
||||
static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
|
||||
unsigned int cpg)
|
||||
{
|
||||
int blocks;
|
||||
int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
|
||||
/* parent inode update + new block group header + bitmap inode update
|
||||
+ bitmap blocks affected */
|
||||
blocks = 1 + 1 + 1 + bitmap_blocks;
|
||||
return blocks;
|
||||
}
|
||||
|
||||
static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
|
||||
unsigned int clusters_to_del,
|
||||
struct ocfs2_dinode *fe,
|
||||
struct ocfs2_extent_list *last_el)
|
||||
{
|
||||
/* for dinode + all headers in this pass + update to next leaf */
|
||||
u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
|
||||
u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
|
||||
int credits = 1 + tree_depth + 1;
|
||||
int i;
|
||||
|
||||
i = next_free - 1;
|
||||
BUG_ON(i < 0);
|
||||
|
||||
/* We may be deleting metadata blocks, so metadata alloc dinode +
|
||||
one desc. block for each possible delete. */
|
||||
if (tree_depth && next_free == 1 &&
|
||||
le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
|
||||
credits += 1 + tree_depth;
|
||||
|
||||
/* update to the truncate log. */
|
||||
credits += OCFS2_TRUNCATE_LOG_UPDATE;
|
||||
|
||||
return credits;
|
||||
}
|
||||
|
||||
#endif /* OCFS2_JOURNAL_H */
|
983
fs/ocfs2/localalloc.c
Normal file
983
fs/ocfs2/localalloc.c
Normal file
@ -0,0 +1,983 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* localalloc.c
|
||||
*
|
||||
* Node local data allocation
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/bitops.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "dlmglue.h"
|
||||
#include "inode.h"
|
||||
#include "journal.h"
|
||||
#include "localalloc.h"
|
||||
#include "suballoc.h"
|
||||
#include "super.h"
|
||||
#include "sysfile.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
|
||||
|
||||
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
|
||||
|
||||
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
|
||||
|
||||
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
|
||||
struct ocfs2_dinode *alloc,
|
||||
u32 numbits);
|
||||
|
||||
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);
|
||||
|
||||
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_dinode *alloc,
|
||||
struct inode *main_bm_inode,
|
||||
struct buffer_head *main_bm_bh);
|
||||
|
||||
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context **ac,
|
||||
struct inode **bitmap_inode,
|
||||
struct buffer_head **bitmap_bh);
|
||||
|
||||
static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *ac);
|
||||
|
||||
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
|
||||
struct inode *local_alloc_inode);
|
||||
|
||||
/*
|
||||
* Determine how large our local alloc window should be, in bits.
|
||||
*
|
||||
* These values (and the behavior in ocfs2_alloc_should_use_local) have
|
||||
* been chosen so that most allocations, including new block groups go
|
||||
* through local alloc.
|
||||
*/
|
||||
static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
|
||||
{
|
||||
BUG_ON(osb->s_clustersize_bits < 12);
|
||||
|
||||
return 2048 >> (osb->s_clustersize_bits - 12);
|
||||
}
|
||||
|
||||
/*
|
||||
* Tell us whether a given allocation should use the local alloc
|
||||
* file. Otherwise, it has to go to the main bitmap.
|
||||
*/
|
||||
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
|
||||
{
|
||||
int la_bits = ocfs2_local_alloc_window_bits(osb);
|
||||
|
||||
if (osb->local_alloc_state != OCFS2_LA_ENABLED)
|
||||
return 0;
|
||||
|
||||
/* la_bits should be at least twice the size (in clusters) of
|
||||
* a new block group. We want to be sure block group
|
||||
* allocations go through the local alloc, so allow an
|
||||
* allocation to take up to half the bitmap. */
|
||||
if (bits > (la_bits / 2))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ocfs2_load_local_alloc(struct ocfs2_super *osb)
|
||||
{
|
||||
int status = 0;
|
||||
struct ocfs2_dinode *alloc = NULL;
|
||||
struct buffer_head *alloc_bh = NULL;
|
||||
u32 num_used;
|
||||
struct inode *inode = NULL;
|
||||
struct ocfs2_local_alloc *la;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
/* read the alloc off disk */
|
||||
inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
|
||||
osb->slot_num);
|
||||
if (!inode) {
|
||||
status = -EINVAL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
|
||||
&alloc_bh, 0, inode);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
|
||||
la = OCFS2_LOCAL_ALLOC(alloc);
|
||||
|
||||
if (!(le32_to_cpu(alloc->i_flags) &
|
||||
(OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
|
||||
mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n",
|
||||
OCFS2_I(inode)->ip_blkno);
|
||||
status = -EINVAL;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if ((la->la_size == 0) ||
|
||||
(le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
|
||||
mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
|
||||
le16_to_cpu(la->la_size));
|
||||
status = -EINVAL;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
/* do a little verification. */
|
||||
num_used = ocfs2_local_alloc_count_bits(alloc);
|
||||
|
||||
/* hopefully the local alloc has always been recovered before
|
||||
* we load it. */
|
||||
if (num_used
|
||||
|| alloc->id1.bitmap1.i_used
|
||||
|| alloc->id1.bitmap1.i_total
|
||||
|| la->la_bm_off)
|
||||
mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
|
||||
"found = %u, set = %u, taken = %u, off = %u\n",
|
||||
num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
|
||||
le32_to_cpu(alloc->id1.bitmap1.i_total),
|
||||
OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
|
||||
|
||||
osb->local_alloc_bh = alloc_bh;
|
||||
osb->local_alloc_state = OCFS2_LA_ENABLED;
|
||||
|
||||
bail:
|
||||
if (status < 0)
|
||||
if (alloc_bh)
|
||||
brelse(alloc_bh);
|
||||
if (inode)
|
||||
iput(inode);
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/*
|
||||
* return any unused bits to the bitmap and write out a clean
|
||||
* local_alloc.
|
||||
*
|
||||
* local_alloc_bh is optional. If not passed, we will simply use the
|
||||
* one off osb. If you do pass it however, be warned that it *will* be
|
||||
* returned brelse'd and NULL'd out.*/
|
||||
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
|
||||
{
|
||||
int status;
|
||||
struct ocfs2_journal_handle *handle = NULL;
|
||||
struct inode *local_alloc_inode = NULL;
|
||||
struct buffer_head *bh = NULL;
|
||||
struct buffer_head *main_bm_bh = NULL;
|
||||
struct inode *main_bm_inode = NULL;
|
||||
struct ocfs2_dinode *alloc_copy = NULL;
|
||||
struct ocfs2_dinode *alloc = NULL;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
|
||||
goto bail;
|
||||
|
||||
local_alloc_inode =
|
||||
ocfs2_get_system_file_inode(osb,
|
||||
LOCAL_ALLOC_SYSTEM_INODE,
|
||||
osb->slot_num);
|
||||
if (!local_alloc_inode) {
|
||||
status = -ENOENT;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
osb->local_alloc_state = OCFS2_LA_DISABLED;
|
||||
|
||||
handle = ocfs2_alloc_handle(osb);
|
||||
if (!handle) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
main_bm_inode = ocfs2_get_system_file_inode(osb,
|
||||
GLOBAL_BITMAP_SYSTEM_INODE,
|
||||
OCFS2_INVALID_SLOT);
|
||||
if (!main_bm_inode) {
|
||||
status = -EINVAL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
ocfs2_handle_add_inode(handle, main_bm_inode);
|
||||
status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
/* WINDOW_MOVE_CREDITS is a bit heavy... */
|
||||
handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
|
||||
if (IS_ERR(handle)) {
|
||||
mlog_errno(PTR_ERR(handle));
|
||||
handle = NULL;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
bh = osb->local_alloc_bh;
|
||||
alloc = (struct ocfs2_dinode *) bh->b_data;
|
||||
|
||||
alloc_copy = kmalloc(bh->b_size, GFP_KERNEL);
|
||||
if (!alloc_copy) {
|
||||
status = -ENOMEM;
|
||||
goto bail;
|
||||
}
|
||||
memcpy(alloc_copy, alloc, bh->b_size);
|
||||
|
||||
status = ocfs2_journal_access(handle, local_alloc_inode, bh,
|
||||
OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
ocfs2_clear_local_alloc(alloc);
|
||||
|
||||
status = ocfs2_journal_dirty(handle, bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
brelse(bh);
|
||||
osb->local_alloc_bh = NULL;
|
||||
osb->local_alloc_state = OCFS2_LA_UNUSED;
|
||||
|
||||
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
|
||||
main_bm_inode, main_bm_bh);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
bail:
|
||||
if (handle)
|
||||
ocfs2_commit_trans(handle);
|
||||
|
||||
if (main_bm_bh)
|
||||
brelse(main_bm_bh);
|
||||
|
||||
if (main_bm_inode)
|
||||
iput(main_bm_inode);
|
||||
|
||||
if (local_alloc_inode)
|
||||
iput(local_alloc_inode);
|
||||
|
||||
if (alloc_copy)
|
||||
kfree(alloc_copy);
|
||||
|
||||
mlog_exit_void();
|
||||
}
|
||||
|
||||
/*
|
||||
* We want to free the bitmap bits outside of any recovery context as
|
||||
* we'll need a cluster lock to do so, but we must clear the local
|
||||
* alloc before giving up the recovered nodes journal. To solve this,
|
||||
* we kmalloc a copy of the local alloc before it's change for the
|
||||
* caller to process with ocfs2_complete_local_alloc_recovery
|
||||
*/
|
||||
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
|
||||
int slot_num,
|
||||
struct ocfs2_dinode **alloc_copy)
|
||||
{
|
||||
int status = 0;
|
||||
struct buffer_head *alloc_bh = NULL;
|
||||
struct inode *inode = NULL;
|
||||
struct ocfs2_dinode *alloc;
|
||||
|
||||
mlog_entry("(slot_num = %d)\n", slot_num);
|
||||
|
||||
*alloc_copy = NULL;
|
||||
|
||||
inode = ocfs2_get_system_file_inode(osb,
|
||||
LOCAL_ALLOC_SYSTEM_INODE,
|
||||
slot_num);
|
||||
if (!inode) {
|
||||
status = -EINVAL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
down(&inode->i_sem);
|
||||
|
||||
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
|
||||
&alloc_bh, 0, inode);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
|
||||
if (!(*alloc_copy)) {
|
||||
status = -ENOMEM;
|
||||
goto bail;
|
||||
}
|
||||
memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);
|
||||
|
||||
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
|
||||
ocfs2_clear_local_alloc(alloc);
|
||||
|
||||
status = ocfs2_write_block(osb, alloc_bh, inode);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
bail:
|
||||
if ((status < 0) && (*alloc_copy)) {
|
||||
kfree(*alloc_copy);
|
||||
*alloc_copy = NULL;
|
||||
}
|
||||
|
||||
if (alloc_bh)
|
||||
brelse(alloc_bh);
|
||||
|
||||
if (inode) {
|
||||
up(&inode->i_sem);
|
||||
iput(inode);
|
||||
}
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/*
|
||||
* Step 2: By now, we've completed the journal recovery, we've stamped
|
||||
* a clean local alloc on disk and dropped the node out of the
|
||||
* recovery map. Dlm locks will no longer stall, so lets clear out the
|
||||
* main bitmap.
|
||||
*/
|
||||
int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
|
||||
struct ocfs2_dinode *alloc)
|
||||
{
|
||||
int status;
|
||||
struct ocfs2_journal_handle *handle = NULL;
|
||||
struct buffer_head *main_bm_bh = NULL;
|
||||
struct inode *main_bm_inode = NULL;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
handle = ocfs2_alloc_handle(osb);
|
||||
if (!handle) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
main_bm_inode = ocfs2_get_system_file_inode(osb,
|
||||
GLOBAL_BITMAP_SYSTEM_INODE,
|
||||
OCFS2_INVALID_SLOT);
|
||||
if (!main_bm_inode) {
|
||||
status = -EINVAL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
ocfs2_handle_add_inode(handle, main_bm_inode);
|
||||
status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
|
||||
if (IS_ERR(handle)) {
|
||||
status = PTR_ERR(handle);
|
||||
handle = NULL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
/* we want the bitmap change to be recorded on disk asap */
|
||||
ocfs2_handle_set_sync(handle, 1);
|
||||
|
||||
status = ocfs2_sync_local_to_main(osb, handle, alloc,
|
||||
main_bm_inode, main_bm_bh);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
bail:
|
||||
if (handle)
|
||||
ocfs2_commit_trans(handle);
|
||||
|
||||
if (main_bm_bh)
|
||||
brelse(main_bm_bh);
|
||||
|
||||
if (main_bm_inode)
|
||||
iput(main_bm_inode);
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/*
|
||||
* make sure we've got at least bitswanted contiguous bits in the
|
||||
* local alloc. You lose them when you drop i_sem.
|
||||
*
|
||||
* We will add ourselves to the transaction passed in, but may start
|
||||
* our own in order to shift windows.
|
||||
*/
|
||||
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *passed_handle,
|
||||
u32 bits_wanted,
|
||||
struct ocfs2_alloc_context *ac)
|
||||
{
|
||||
int status;
|
||||
struct ocfs2_dinode *alloc;
|
||||
struct inode *local_alloc_inode;
|
||||
unsigned int free_bits;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
BUG_ON(!passed_handle);
|
||||
BUG_ON(!ac);
|
||||
BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED);
|
||||
|
||||
local_alloc_inode =
|
||||
ocfs2_get_system_file_inode(osb,
|
||||
LOCAL_ALLOC_SYSTEM_INODE,
|
||||
osb->slot_num);
|
||||
if (!local_alloc_inode) {
|
||||
status = -ENOENT;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
ocfs2_handle_add_inode(passed_handle, local_alloc_inode);
|
||||
|
||||
if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
|
||||
status = -ENOSPC;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
|
||||
mlog(0, "Asking for more than my max window size!\n");
|
||||
status = -ENOSPC;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
|
||||
|
||||
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
|
||||
ocfs2_local_alloc_count_bits(alloc)) {
|
||||
ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has "
|
||||
"%u free bits, but a count shows %u",
|
||||
le64_to_cpu(alloc->i_blkno),
|
||||
le32_to_cpu(alloc->id1.bitmap1.i_used),
|
||||
ocfs2_local_alloc_count_bits(alloc));
|
||||
status = -EIO;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
|
||||
le32_to_cpu(alloc->id1.bitmap1.i_used);
|
||||
if (bits_wanted > free_bits) {
|
||||
/* uhoh, window change time. */
|
||||
status =
|
||||
ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
|
||||
if (status < 0) {
|
||||
if (status != -ENOSPC)
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
|
||||
ac->ac_inode = igrab(local_alloc_inode);
|
||||
get_bh(osb->local_alloc_bh);
|
||||
ac->ac_bh = osb->local_alloc_bh;
|
||||
ac->ac_which = OCFS2_AC_USE_LOCAL;
|
||||
status = 0;
|
||||
bail:
|
||||
if (local_alloc_inode)
|
||||
iput(local_alloc_inode);
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *ac,
|
||||
u32 min_bits,
|
||||
u32 *bit_off,
|
||||
u32 *num_bits)
|
||||
{
|
||||
int status, start;
|
||||
struct inode *local_alloc_inode;
|
||||
u32 bits_wanted;
|
||||
void *bitmap;
|
||||
struct ocfs2_dinode *alloc;
|
||||
struct ocfs2_local_alloc *la;
|
||||
|
||||
mlog_entry_void();
|
||||
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
|
||||
|
||||
bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
|
||||
local_alloc_inode = ac->ac_inode;
|
||||
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
|
||||
la = OCFS2_LOCAL_ALLOC(alloc);
|
||||
|
||||
start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
|
||||
if (start == -1) {
|
||||
/* TODO: Shouldn't we just BUG here? */
|
||||
status = -ENOSPC;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
bitmap = la->la_bitmap;
|
||||
*bit_off = le32_to_cpu(la->la_bm_off) + start;
|
||||
/* local alloc is always contiguous by nature -- we never
|
||||
* delete bits from it! */
|
||||
*num_bits = bits_wanted;
|
||||
|
||||
status = ocfs2_journal_access(handle, local_alloc_inode,
|
||||
osb->local_alloc_bh,
|
||||
OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
while(bits_wanted--)
|
||||
ocfs2_set_bit(start++, bitmap);
|
||||
|
||||
alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits +
|
||||
le32_to_cpu(alloc->id1.bitmap1.i_used));
|
||||
|
||||
status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = 0;
|
||||
bail:
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
|
||||
{
|
||||
int i;
|
||||
u8 *buffer;
|
||||
u32 count = 0;
|
||||
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
buffer = la->la_bitmap;
|
||||
for (i = 0; i < le16_to_cpu(la->la_size); i++)
|
||||
count += hweight8(buffer[i]);
|
||||
|
||||
mlog_exit(count);
|
||||
return count;
|
||||
}
|
||||
|
||||
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
|
||||
struct ocfs2_dinode *alloc,
|
||||
u32 numbits)
|
||||
{
|
||||
int numfound, bitoff, left, startoff, lastzero;
|
||||
void *bitmap = NULL;
|
||||
|
||||
mlog_entry("(numbits wanted = %u)\n", numbits);
|
||||
|
||||
if (!alloc->id1.bitmap1.i_total) {
|
||||
mlog(0, "No bits in my window!\n");
|
||||
bitoff = -1;
|
||||
goto bail;
|
||||
}
|
||||
|
||||
bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;
|
||||
|
||||
numfound = bitoff = startoff = 0;
|
||||
lastzero = -1;
|
||||
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
|
||||
while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
|
||||
if (bitoff == left) {
|
||||
/* mlog(0, "bitoff (%d) == left", bitoff); */
|
||||
break;
|
||||
}
|
||||
/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
|
||||
"numfound = %d\n", bitoff, startoff, numfound);*/
|
||||
|
||||
/* Ok, we found a zero bit... is it contig. or do we
|
||||
* start over?*/
|
||||
if (bitoff == startoff) {
|
||||
/* we found a zero */
|
||||
numfound++;
|
||||
startoff++;
|
||||
} else {
|
||||
/* got a zero after some ones */
|
||||
numfound = 1;
|
||||
startoff = bitoff+1;
|
||||
}
|
||||
/* we got everything we needed */
|
||||
if (numfound == numbits) {
|
||||
/* mlog(0, "Found it all!\n"); */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
|
||||
numfound);
|
||||
|
||||
if (numfound == numbits)
|
||||
bitoff = startoff - numfound;
|
||||
else
|
||||
bitoff = -1;
|
||||
|
||||
bail:
|
||||
mlog_exit(bitoff);
|
||||
return bitoff;
|
||||
}
|
||||
|
||||
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
|
||||
{
|
||||
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
|
||||
int i;
|
||||
mlog_entry_void();
|
||||
|
||||
alloc->id1.bitmap1.i_total = 0;
|
||||
alloc->id1.bitmap1.i_used = 0;
|
||||
la->la_bm_off = 0;
|
||||
for(i = 0; i < le16_to_cpu(la->la_size); i++)
|
||||
la->la_bitmap[i] = 0;
|
||||
|
||||
mlog_exit_void();
|
||||
}
|
||||
|
||||
#if 0
/* turn this on and uncomment below to aid debugging window shifts. */
/* BUG()s if any of the @count bits starting at @start is set in @bitmap;
 * the bits are checked from the highest index down. */
static void ocfs2_verify_zero_bits(unsigned long *bitmap,
				   unsigned int start,
				   unsigned int count)
{
	unsigned int idx;

	for (idx = count; idx-- != 0; ) {
		if (!ocfs2_test_bit(start + idx, bitmap))
			continue;

		printk("ocfs2_verify_zero_bits: start = %u, count = "
		       "%u\n", start, count);
		printk("ocfs2_verify_zero_bits: bit %u is set!",
		       start + idx);
		BUG();
	}
}
#endif
|
||||
|
||||
/*
|
||||
* sync the local alloc to main bitmap.
|
||||
*
|
||||
* assumes you've already locked the main bitmap -- the bitmap inode
|
||||
* passed is used for caching.
|
||||
*/
|
||||
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_dinode *alloc,
|
||||
struct inode *main_bm_inode,
|
||||
struct buffer_head *main_bm_bh)
|
||||
{
|
||||
int status = 0;
|
||||
int bit_off, left, count, start;
|
||||
u64 la_start_blk;
|
||||
u64 blkno;
|
||||
void *bitmap;
|
||||
struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
|
||||
|
||||
mlog_entry("total = %u, COUNT = %u, used = %u\n",
|
||||
le32_to_cpu(alloc->id1.bitmap1.i_total),
|
||||
ocfs2_local_alloc_count_bits(alloc),
|
||||
le32_to_cpu(alloc->id1.bitmap1.i_used));
|
||||
|
||||
if (!alloc->id1.bitmap1.i_total) {
|
||||
mlog(0, "nothing to sync!\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
|
||||
le32_to_cpu(alloc->id1.bitmap1.i_total)) {
|
||||
mlog(0, "all bits were taken!\n");
|
||||
goto bail;
|
||||
}
|
||||
|
||||
la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
|
||||
le32_to_cpu(la->la_bm_off));
|
||||
bitmap = la->la_bitmap;
|
||||
start = count = bit_off = 0;
|
||||
left = le32_to_cpu(alloc->id1.bitmap1.i_total);
|
||||
|
||||
while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
|
||||
!= -1) {
|
||||
if ((bit_off < left) && (bit_off == start)) {
|
||||
count++;
|
||||
start++;
|
||||
continue;
|
||||
}
|
||||
if (count) {
|
||||
blkno = la_start_blk +
|
||||
ocfs2_clusters_to_blocks(osb->sb,
|
||||
start - count);
|
||||
|
||||
mlog(0, "freeing %u bits starting at local "
|
||||
"alloc bit %u (la_start_blk = %"MLFu64", "
|
||||
"blkno = %"MLFu64")\n", count, start - count,
|
||||
la_start_blk, blkno);
|
||||
|
||||
status = ocfs2_free_clusters(handle, main_bm_inode,
|
||||
main_bm_bh, blkno, count);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
}
|
||||
if (bit_off >= left)
|
||||
break;
|
||||
count = 1;
|
||||
start = bit_off + 1;
|
||||
}
|
||||
|
||||
bail:
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context **ac,
|
||||
struct inode **bitmap_inode,
|
||||
struct buffer_head **bitmap_bh)
|
||||
{
|
||||
int status;
|
||||
|
||||
*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
|
||||
if (!(*ac)) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
(*ac)->ac_handle = handle;
|
||||
(*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
|
||||
|
||||
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
|
||||
if (status < 0) {
|
||||
if (status != -ENOSPC)
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
*bitmap_inode = (*ac)->ac_inode;
|
||||
igrab(*bitmap_inode);
|
||||
*bitmap_bh = (*ac)->ac_bh;
|
||||
get_bh(*bitmap_bh);
|
||||
status = 0;
|
||||
bail:
|
||||
if ((status < 0) && *ac) {
|
||||
ocfs2_free_alloc_context(*ac);
|
||||
*ac = NULL;
|
||||
}
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/*
|
||||
* pass it the bitmap lock in lock_bh if you have it.
|
||||
*/
|
||||
static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *ac)
|
||||
{
|
||||
int status = 0;
|
||||
u32 cluster_off, cluster_count;
|
||||
struct ocfs2_dinode *alloc = NULL;
|
||||
struct ocfs2_local_alloc *la;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
|
||||
la = OCFS2_LOCAL_ALLOC(alloc);
|
||||
|
||||
if (alloc->id1.bitmap1.i_total)
|
||||
mlog(0, "asking me to alloc a new window over a non-empty "
|
||||
"one\n");
|
||||
|
||||
mlog(0, "Allocating %u clusters for a new window.\n",
|
||||
ocfs2_local_alloc_window_bits(osb));
|
||||
/* we used the generic suballoc reserve function, but we set
|
||||
* everything up nicely, so there's no reason why we can't use
|
||||
* the more specific cluster api to claim bits. */
|
||||
status = ocfs2_claim_clusters(osb, handle, ac,
|
||||
ocfs2_local_alloc_window_bits(osb),
|
||||
&cluster_off, &cluster_count);
|
||||
if (status < 0) {
|
||||
if (status != -ENOSPC)
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
la->la_bm_off = cpu_to_le32(cluster_off);
|
||||
alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
|
||||
/* just in case... In the future when we find space ourselves,
|
||||
* we don't have to get all contiguous -- but we'll have to
|
||||
* set all previously used bits in bitmap and update
|
||||
* la_bits_set before setting the bits in the main bitmap. */
|
||||
alloc->id1.bitmap1.i_used = 0;
|
||||
memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
|
||||
le16_to_cpu(la->la_size));
|
||||
|
||||
mlog(0, "New window allocated:\n");
|
||||
mlog(0, "window la_bm_off = %u\n",
|
||||
OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
|
||||
mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
|
||||
|
||||
bail:
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
/* Note that we do *NOT* lock the local alloc inode here as
|
||||
* it's been locked already for us. */
|
||||
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
|
||||
struct inode *local_alloc_inode)
|
||||
{
|
||||
int status = 0;
|
||||
struct buffer_head *main_bm_bh = NULL;
|
||||
struct inode *main_bm_inode = NULL;
|
||||
struct ocfs2_journal_handle *handle = NULL;
|
||||
struct ocfs2_dinode *alloc;
|
||||
struct ocfs2_dinode *alloc_copy = NULL;
|
||||
struct ocfs2_alloc_context *ac = NULL;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
handle = ocfs2_alloc_handle(osb);
|
||||
if (!handle) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
/* This will lock the main bitmap for us. */
|
||||
status = ocfs2_local_alloc_reserve_for_window(osb,
|
||||
handle,
|
||||
&ac,
|
||||
&main_bm_inode,
|
||||
&main_bm_bh);
|
||||
if (status < 0) {
|
||||
if (status != -ENOSPC)
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS);
|
||||
if (IS_ERR(handle)) {
|
||||
status = PTR_ERR(handle);
|
||||
handle = NULL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
|
||||
|
||||
/* We want to clear the local alloc before doing anything
|
||||
* else, so that if we error later during this operation,
|
||||
* local alloc shutdown won't try to double free main bitmap
|
||||
* bits. Make a copy so the sync function knows which bits to
|
||||
* free. */
|
||||
alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL);
|
||||
if (!alloc_copy) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
|
||||
|
||||
status = ocfs2_journal_access(handle, local_alloc_inode,
|
||||
osb->local_alloc_bh,
|
||||
OCFS2_JOURNAL_ACCESS_WRITE);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
ocfs2_clear_local_alloc(alloc);
|
||||
|
||||
status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
|
||||
main_bm_inode, main_bm_bh);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = ocfs2_local_alloc_new_window(osb, handle, ac);
|
||||
if (status < 0) {
|
||||
if (status != -ENOSPC)
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
atomic_inc(&osb->alloc_stats.moves);
|
||||
|
||||
status = 0;
|
||||
bail:
|
||||
if (handle)
|
||||
ocfs2_commit_trans(handle);
|
||||
|
||||
if (main_bm_bh)
|
||||
brelse(main_bm_bh);
|
||||
|
||||
if (main_bm_inode)
|
||||
iput(main_bm_inode);
|
||||
|
||||
if (alloc_copy)
|
||||
kfree(alloc_copy);
|
||||
|
||||
if (ac)
|
||||
ocfs2_free_alloc_context(ac);
|
||||
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
56
fs/ocfs2/localalloc.h
Normal file
56
fs/ocfs2/localalloc.h
Normal file
@ -0,0 +1,56 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* localalloc.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_LOCALALLOC_H
|
||||
#define OCFS2_LOCALALLOC_H
|
||||
|
||||
int ocfs2_load_local_alloc(struct ocfs2_super *osb);
|
||||
|
||||
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb);
|
||||
|
||||
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
|
||||
int node_num,
|
||||
struct ocfs2_dinode **alloc_copy);
|
||||
|
||||
int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
|
||||
struct ocfs2_dinode *alloc);
|
||||
|
||||
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb,
|
||||
u64 bits);
|
||||
|
||||
struct ocfs2_alloc_context;
|
||||
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *passed_handle,
|
||||
u32 bits_wanted,
|
||||
struct ocfs2_alloc_context *ac);
|
||||
|
||||
int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *ac,
|
||||
u32 min_bits,
|
||||
u32 *bit_off,
|
||||
u32 *num_bits);
|
||||
|
||||
#endif /* OCFS2_LOCALALLOC_H */
|
102
fs/ocfs2/mmap.c
Normal file
102
fs/ocfs2/mmap.c
Normal file
@ -0,0 +1,102 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* mmap.c
|
||||
*
|
||||
* Code to deal with the mess that is clustered mmap.
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/signal.h>
|
||||
#include <linux/rbtree.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_FILE_IO
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "dlmglue.h"
|
||||
#include "file.h"
|
||||
#include "inode.h"
|
||||
#include "mmap.h"
|
||||
|
||||
static struct page *ocfs2_nopage(struct vm_area_struct * area,
|
||||
unsigned long address,
|
||||
int *type)
|
||||
{
|
||||
struct inode *inode = area->vm_file->f_dentry->d_inode;
|
||||
struct page *page = NOPAGE_SIGBUS;
|
||||
sigset_t blocked, oldset;
|
||||
int ret;
|
||||
|
||||
mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
|
||||
|
||||
/* The best way to deal with signals in this path is
|
||||
* to block them upfront, rather than allowing the
|
||||
* locking paths to return -ERESTARTSYS. */
|
||||
sigfillset(&blocked);
|
||||
|
||||
/* We should technically never get a bad ret return
|
||||
* from sigprocmask */
|
||||
ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
|
||||
if (ret < 0) {
|
||||
mlog_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
page = filemap_nopage(area, address, type);
|
||||
|
||||
ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
|
||||
if (ret < 0)
|
||||
mlog_errno(ret);
|
||||
out:
|
||||
mlog_exit_ptr(page);
|
||||
return page;
|
||||
}
|
||||
|
||||
static struct vm_operations_struct ocfs2_file_vm_ops = {
|
||||
.nopage = ocfs2_nopage,
|
||||
};
|
||||
|
||||
int ocfs2_mmap(struct file *file,
|
||||
struct vm_area_struct *vma)
|
||||
{
|
||||
struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
|
||||
struct inode *inode = mapping->host;
|
||||
|
||||
/* We don't want to support shared writable mappings yet. */
|
||||
if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE))
|
||||
&& ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
|
||||
mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
|
||||
/* This is -EINVAL because generic_file_readonly_mmap
|
||||
* returns it in a similar situation. */
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
update_atime(inode);
|
||||
vma->vm_ops = &ocfs2_file_vm_ops;
|
||||
return 0;
|
||||
}
|
||||
|
6
fs/ocfs2/mmap.h
Normal file
6
fs/ocfs2/mmap.h
Normal file
@ -0,0 +1,6 @@
|
||||
#ifndef OCFS2_MMAP_H
|
||||
#define OCFS2_MMAP_H
|
||||
|
||||
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
|
||||
|
||||
#endif /* OCFS2_MMAP_H */
|
2264
fs/ocfs2/namei.c
Normal file
2264
fs/ocfs2/namei.c
Normal file
File diff suppressed because it is too large
Load Diff
58
fs/ocfs2/namei.h
Normal file
58
fs/ocfs2/namei.h
Normal file
@ -0,0 +1,58 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* namei.h
|
||||
*
|
||||
* Function prototypes
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_NAMEI_H
|
||||
#define OCFS2_NAMEI_H
|
||||
|
||||
extern struct inode_operations ocfs2_dir_iops;
|
||||
|
||||
struct dentry *ocfs2_get_parent(struct dentry *child);
|
||||
|
||||
int ocfs2_check_dir_entry (struct inode *dir,
|
||||
struct ocfs2_dir_entry *de,
|
||||
struct buffer_head *bh,
|
||||
unsigned long offset);
|
||||
struct buffer_head *ocfs2_find_entry(const char *name,
|
||||
int namelen,
|
||||
struct inode *dir,
|
||||
struct ocfs2_dir_entry **res_dir);
|
||||
int ocfs2_orphan_del(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct inode *orphan_dir_inode,
|
||||
struct inode *inode,
|
||||
struct buffer_head *orphan_dir_bh);
|
||||
|
||||
static inline int ocfs2_match(int len,
|
||||
const char * const name,
|
||||
struct ocfs2_dir_entry *de)
|
||||
{
|
||||
if (len != de->name_len)
|
||||
return 0;
|
||||
if (!de->inode)
|
||||
return 0;
|
||||
return !memcmp(name, de->name, len);
|
||||
}
|
||||
|
||||
#endif /* OCFS2_NAMEI_H */
|
109
fs/ocfs2/ocfs1_fs_compat.h
Normal file
109
fs/ocfs2/ocfs1_fs_compat.h
Normal file
@ -0,0 +1,109 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs1_fs_compat.h
|
||||
*
|
||||
* OCFS1 volume header definitions. OCFS2 creates valid but unmountable
|
||||
* OCFS1 volume headers on the first two sectors of an OCFS2 volume.
|
||||
* This allows an OCFS1 volume to see the partition and cleanly fail to
|
||||
* mount it.
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License, version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _OCFS1_FS_COMPAT_H
|
||||
#define _OCFS1_FS_COMPAT_H
|
||||
|
||||
#define OCFS1_MAX_VOL_SIGNATURE_LEN 128
|
||||
#define OCFS1_MAX_MOUNT_POINT_LEN 128
|
||||
#define OCFS1_MAX_VOL_ID_LENGTH 16
|
||||
#define OCFS1_MAX_VOL_LABEL_LEN 64
|
||||
#define OCFS1_MAX_CLUSTER_NAME_LEN 64
|
||||
|
||||
#define OCFS1_MAJOR_VERSION (2)
|
||||
#define OCFS1_MINOR_VERSION (0)
|
||||
#define OCFS1_VOLUME_SIGNATURE "OracleCFS"
|
||||
|
||||
/*
|
||||
* OCFS1 superblock. Lives at sector 0.
|
||||
*/
|
||||
struct ocfs1_vol_disk_hdr
|
||||
{
|
||||
/*00*/ __u32 minor_version;
|
||||
__u32 major_version;
|
||||
/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN];
|
||||
/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN];
|
||||
/*108*/ __u64 serial_num;
|
||||
/*110*/ __u64 device_size;
|
||||
__u64 start_off;
|
||||
/*120*/ __u64 bitmap_off;
|
||||
__u64 publ_off;
|
||||
/*130*/ __u64 vote_off;
|
||||
__u64 root_bitmap_off;
|
||||
/*140*/ __u64 data_start_off;
|
||||
__u64 root_bitmap_size;
|
||||
/*150*/ __u64 root_off;
|
||||
__u64 root_size;
|
||||
/*160*/ __u64 cluster_size;
|
||||
__u64 num_nodes;
|
||||
/*170*/ __u64 num_clusters;
|
||||
__u64 dir_node_size;
|
||||
/*180*/ __u64 file_node_size;
|
||||
__u64 internal_off;
|
||||
/*190*/ __u64 node_cfg_off;
|
||||
__u64 node_cfg_size;
|
||||
/*1A0*/ __u64 new_cfg_off;
|
||||
__u32 prot_bits;
|
||||
__s32 excl_mount;
|
||||
/*1B0*/
|
||||
};
|
||||
|
||||
|
||||
struct ocfs1_disk_lock
|
||||
{
|
||||
/*00*/ __u32 curr_master;
|
||||
__u8 file_lock;
|
||||
__u8 compat_pad[3]; /* Not in orignal definition. Used to
|
||||
make the already existing alignment
|
||||
explicit */
|
||||
__u64 last_write_time;
|
||||
/*10*/ __u64 last_read_time;
|
||||
__u32 writer_node_num;
|
||||
__u32 reader_node_num;
|
||||
/*20*/ __u64 oin_node_map;
|
||||
__u64 dlock_seq_num;
|
||||
/*30*/
|
||||
};
|
||||
|
||||
/*
|
||||
* OCFS1 volume label. Lives at sector 1.
|
||||
*/
|
||||
struct ocfs1_vol_label
|
||||
{
|
||||
/*00*/ struct ocfs1_disk_lock disk_lock;
|
||||
/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN];
|
||||
/*70*/ __u16 label_len;
|
||||
/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH];
|
||||
/*82*/ __u16 vol_id_len;
|
||||
/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN];
|
||||
/*A4*/ __u16 cluster_name_len;
|
||||
/*A6*/
|
||||
};
|
||||
|
||||
|
||||
#endif /* _OCFS1_FS_COMPAT_H */
|
||||
|
464
fs/ocfs2/ocfs2.h
Normal file
464
fs/ocfs2/ocfs2.h
Normal file
@ -0,0 +1,464 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs2.h
|
||||
*
|
||||
* Defines macros and structures used in OCFS2
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_H
|
||||
#define OCFS2_H
|
||||
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/kref.h>
|
||||
|
||||
#include "cluster/nodemanager.h"
|
||||
#include "cluster/heartbeat.h"
|
||||
#include "cluster/tcp.h"
|
||||
|
||||
#include "dlm/dlmapi.h"
|
||||
|
||||
#include "ocfs2_fs.h"
|
||||
#include "endian.h"
|
||||
#include "ocfs2_lockid.h"
|
||||
|
||||
struct ocfs2_extent_map {
|
||||
u32 em_clusters;
|
||||
struct rb_root em_extents;
|
||||
};
|
||||
|
||||
/* Most user visible OCFS2 inodes will have very few pieces of
|
||||
* metadata, but larger files (including bitmaps, etc) must be taken
|
||||
* into account when designing an access scheme. We allow a small
|
||||
* amount of inlined blocks to be stored on an array and grow the
|
||||
* structure into a rb tree when necessary. */
|
||||
#define OCFS2_INODE_MAX_CACHE_ARRAY 2
|
||||
|
||||
struct ocfs2_caching_info {
|
||||
unsigned int ci_num_cached;
|
||||
union {
|
||||
sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY];
|
||||
struct rb_root ci_tree;
|
||||
} ci_cache;
|
||||
};
|
||||
|
||||
/* this limits us to 256 nodes
|
||||
* if we need more, we can do a kmalloc for the map */
|
||||
#define OCFS2_NODE_MAP_MAX_NODES 256
|
||||
struct ocfs2_node_map {
|
||||
u16 num_nodes;
|
||||
unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)];
|
||||
};
|
||||
|
||||
/* Values stored in ocfs2_lock_res->l_action; zero means invalid. */
enum ocfs2_ast_action {
	OCFS2_AST_INVALID = 0,
	OCFS2_AST_ATTACH,
	OCFS2_AST_CONVERT,
	OCFS2_AST_DOWNCONVERT,
};
|
||||
|
||||
/* actions for an unlockast function to take. */
|
||||
/* Values stored in ocfs2_lock_res->l_unlock_action; zero means invalid. */
enum ocfs2_unlock_action {
	OCFS2_UNLOCK_INVALID = 0,
	OCFS2_UNLOCK_CANCEL_CONVERT,
	OCFS2_UNLOCK_DROP_LOCK,
};
|
||||
|
||||
/* ocfs2_lock_res->l_flags flags. */
|
||||
#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized
|
||||
* the lvb */
|
||||
#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in
|
||||
* dlm_lock */
|
||||
#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to
|
||||
* downconvert*/
|
||||
#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */
|
||||
#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010)
|
||||
#define OCFS2_LOCK_REFRESHING (0x00000020)
|
||||
#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization
|
||||
* for shutdown paths */
|
||||
#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track
|
||||
* when to skip queueing
|
||||
* a lock because it's
|
||||
* about to be
|
||||
* dropped. */
|
||||
#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
|
||||
|
||||
struct ocfs2_lock_res_ops;
|
||||
|
||||
typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
|
||||
|
||||
struct ocfs2_lock_res {
|
||||
void *l_priv;
|
||||
struct ocfs2_lock_res_ops *l_ops;
|
||||
spinlock_t l_lock;
|
||||
|
||||
struct list_head l_blocked_list;
|
||||
struct list_head l_mask_waiters;
|
||||
|
||||
enum ocfs2_lock_type l_type;
|
||||
unsigned long l_flags;
|
||||
char l_name[OCFS2_LOCK_ID_MAX_LEN];
|
||||
int l_level;
|
||||
unsigned int l_ro_holders;
|
||||
unsigned int l_ex_holders;
|
||||
struct dlm_lockstatus l_lksb;
|
||||
|
||||
/* used from AST/BAST funcs. */
|
||||
enum ocfs2_ast_action l_action;
|
||||
enum ocfs2_unlock_action l_unlock_action;
|
||||
int l_requested;
|
||||
int l_blocking;
|
||||
|
||||
wait_queue_head_t l_event;
|
||||
|
||||
struct list_head l_debug_list;
|
||||
};
|
||||
|
||||
struct ocfs2_dlm_debug {
|
||||
struct kref d_refcnt;
|
||||
struct dentry *d_locking_state;
|
||||
struct list_head d_lockres_tracking;
|
||||
};
|
||||
|
||||
/* Volume states; VOLUME_INIT is zero. */
enum ocfs2_vol_state {
	VOLUME_INIT = 0,
	VOLUME_MOUNTED,
	VOLUME_DISMOUNTED,
	VOLUME_DISABLED
};
|
||||
|
||||
struct ocfs2_alloc_stats
|
||||
{
|
||||
atomic_t moves;
|
||||
atomic_t local_data;
|
||||
atomic_t bitmap_data;
|
||||
atomic_t bg_allocs;
|
||||
atomic_t bg_extends;
|
||||
};
|
||||
|
||||
/* Local alloc states.  Reserving local alloc bits fails with -ENOSPC
 * unless the state is OCFS2_LA_ENABLED. */
enum ocfs2_local_alloc_state {
	OCFS2_LA_UNUSED = 0,
	OCFS2_LA_ENABLED,
	OCFS2_LA_DISABLED
};
|
||||
|
||||
/* Mount option flags; each value is a distinct single bit. */
enum ocfs2_mount_options {
	OCFS2_MOUNT_HB_LOCAL   = 1 << 0, /* Heartbeat started in local mode */
	OCFS2_MOUNT_BARRIER    = 1 << 1, /* Use block barriers */
	OCFS2_MOUNT_NOINTR     = 1 << 2, /* Don't catch signals */
	OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
	OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
#ifdef OCFS2_ORACORE_WORKAROUNDS
	OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
#endif
};
|
||||
|
||||
#define OCFS2_OSB_SOFT_RO 0x0001
|
||||
#define OCFS2_OSB_HARD_RO 0x0002
|
||||
#define OCFS2_OSB_ERROR_FS 0x0004
|
||||
|
||||
struct ocfs2_journal;
|
||||
struct ocfs2_journal_handle;
|
||||
struct ocfs2_super
|
||||
{
|
||||
u32 osb_id; /* id used by the proc interface */
|
||||
struct task_struct *commit_task;
|
||||
struct super_block *sb;
|
||||
struct inode *root_inode;
|
||||
struct inode *sys_root_inode;
|
||||
struct inode *system_inodes[NUM_SYSTEM_INODES];
|
||||
|
||||
struct ocfs2_slot_info *slot_info;
|
||||
|
||||
spinlock_t node_map_lock;
|
||||
struct ocfs2_node_map mounted_map;
|
||||
struct ocfs2_node_map recovery_map;
|
||||
struct ocfs2_node_map umount_map;
|
||||
|
||||
u32 num_clusters;
|
||||
u64 root_blkno;
|
||||
u64 system_dir_blkno;
|
||||
u64 bitmap_blkno;
|
||||
u32 bitmap_cpg;
|
||||
u8 *uuid;
|
||||
char *uuid_str;
|
||||
u8 *vol_label;
|
||||
u64 first_cluster_group_blkno;
|
||||
u32 fs_generation;
|
||||
|
||||
u32 s_feature_compat;
|
||||
u32 s_feature_incompat;
|
||||
u32 s_feature_ro_compat;
|
||||
|
||||
/* Protects s_next_generaion, osb_flags. Could protect more on
|
||||
* osb as it's very short lived. */
|
||||
spinlock_t osb_lock;
|
||||
u32 s_next_generation;
|
||||
unsigned long osb_flags;
|
||||
|
||||
unsigned long s_mount_opt;
|
||||
|
||||
u16 max_slots;
|
||||
u16 num_nodes;
|
||||
s16 node_num;
|
||||
s16 slot_num;
|
||||
int s_sectsize_bits;
|
||||
int s_clustersize;
|
||||
int s_clustersize_bits;
|
||||
struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
|
||||
|
||||
atomic_t vol_state;
|
||||
struct semaphore recovery_lock;
|
||||
struct task_struct *recovery_thread_task;
|
||||
int disable_recovery;
|
||||
wait_queue_head_t checkpoint_event;
|
||||
atomic_t needs_checkpoint;
|
||||
struct ocfs2_journal *journal;
|
||||
|
||||
enum ocfs2_local_alloc_state local_alloc_state;
|
||||
struct buffer_head *local_alloc_bh;
|
||||
|
||||
/* Next two fields are for local node slot recovery during
|
||||
* mount. */
|
||||
int dirty;
|
||||
struct ocfs2_dinode *local_alloc_copy;
|
||||
|
||||
struct ocfs2_alloc_stats alloc_stats;
|
||||
char dev_str[20]; /* "major,minor" of the device */
|
||||
|
||||
struct dlm_ctxt *dlm;
|
||||
struct ocfs2_lock_res osb_super_lockres;
|
||||
struct ocfs2_lock_res osb_rename_lockres;
|
||||
struct dlm_eviction_cb osb_eviction_cb;
|
||||
struct ocfs2_dlm_debug *osb_dlm_debug;
|
||||
|
||||
struct dentry *osb_debug_root;
|
||||
|
||||
wait_queue_head_t recovery_event;
|
||||
|
||||
spinlock_t vote_task_lock;
|
||||
struct task_struct *vote_task;
|
||||
wait_queue_head_t vote_event;
|
||||
unsigned long vote_wake_sequence;
|
||||
unsigned long vote_work_sequence;
|
||||
|
||||
struct list_head blocked_lock_list;
|
||||
unsigned long blocked_lock_count;
|
||||
|
||||
struct list_head vote_list;
|
||||
int vote_count;
|
||||
|
||||
u32 net_key;
|
||||
spinlock_t net_response_lock;
|
||||
unsigned int net_response_ids;
|
||||
struct list_head net_response_list;
|
||||
|
||||
struct o2hb_callback_func osb_hb_up;
|
||||
struct o2hb_callback_func osb_hb_down;
|
||||
|
||||
struct list_head osb_net_handlers;
|
||||
|
||||
wait_queue_head_t osb_mount_event;
|
||||
|
||||
/* Truncate log info */
|
||||
struct inode *osb_tl_inode;
|
||||
struct buffer_head *osb_tl_bh;
|
||||
struct work_struct osb_truncate_log_wq;
|
||||
};
|
||||
|
||||
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
|
||||
#define OCFS2_MAX_OSB_ID 65536
|
||||
|
||||
static inline int ocfs2_should_order_data(struct inode *inode)
|
||||
{
|
||||
if (!S_ISREG(inode->i_mode))
|
||||
return 0;
|
||||
if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* set / clear functions because cluster events can make these happen
|
||||
* in parallel so we want the transitions to be atomic. this also
|
||||
* means that any future flags in osb_flags must be protected by spinlock
|
||||
* too! */
|
||||
static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
|
||||
unsigned long flag)
|
||||
{
|
||||
spin_lock(&osb->osb_lock);
|
||||
osb->osb_flags |= flag;
|
||||
spin_unlock(&osb->osb_lock);
|
||||
}
|
||||
|
||||
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
|
||||
int hard)
|
||||
{
|
||||
spin_lock(&osb->osb_lock);
|
||||
osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO);
|
||||
if (hard)
|
||||
osb->osb_flags |= OCFS2_OSB_HARD_RO;
|
||||
else
|
||||
osb->osb_flags |= OCFS2_OSB_SOFT_RO;
|
||||
spin_unlock(&osb->osb_lock);
|
||||
}
|
||||
|
||||
static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb)
|
||||
{
|
||||
int ret;
|
||||
|
||||
spin_lock(&osb->osb_lock);
|
||||
ret = osb->osb_flags & OCFS2_OSB_HARD_RO;
|
||||
spin_unlock(&osb->osb_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
|
||||
{
|
||||
int ret;
|
||||
|
||||
spin_lock(&osb->osb_lock);
|
||||
ret = osb->osb_flags & OCFS2_OSB_SOFT_RO;
|
||||
spin_unlock(&osb->osb_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define OCFS2_IS_VALID_DINODE(ptr) \
|
||||
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
|
||||
|
||||
/*
 * Report a dinode with a bad signature through ocfs2_error(), printing
 * its i_blkno and the first 7 bytes of i_signature.  __di is evaluated
 * exactly once.
 *
 * There is deliberately no ';' after while (0): the caller supplies it,
 * which keeps the macro a single statement inside unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
	typeof(__di) ____di = (__di);					\
	ocfs2_error((__sb),						\
		"Dinode # %"MLFu64" has bad signature %.*s",		\
		(____di)->i_blkno, 7,					\
		(____di)->i_signature);					\
} while (0)
|
||||
|
||||
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
|
||||
(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
|
||||
|
||||
/*
 * Report an extent block with a bad signature through ocfs2_error(),
 * printing its h_blkno and the first 7 bytes of h_signature.  __eb is
 * evaluated exactly once.
 *
 * There is deliberately no ';' after while (0): the caller supplies it,
 * which keeps the macro a single statement inside unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
	typeof(__eb) ____eb = (__eb);					\
	ocfs2_error((__sb),						\
		"Extent Block # %"MLFu64" has bad signature %.*s",	\
		(____eb)->h_blkno, 7,					\
		(____eb)->h_signature);					\
} while (0)
|
||||
|
||||
#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
|
||||
(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
|
||||
|
||||
/*
 * Report a group descriptor with a bad signature through ocfs2_error(),
 * printing its bg_blkno and the first 7 bytes of bg_signature.  __gd is
 * evaluated exactly once.
 *
 * There is deliberately no ';' after while (0): the caller supplies it,
 * which keeps the macro a single statement inside unbraced if/else.
 */
#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
	typeof(__gd) ____gd = (__gd);					\
	ocfs2_error((__sb),						\
		"Group Descriptor # %"MLFu64" has bad signature %.*s",	\
		(____gd)->bg_blkno, 7,					\
		(____gd)->bg_signature);				\
} while (0)
|
||||
|
||||
static inline unsigned long ino_from_blkno(struct super_block *sb,
|
||||
u64 blkno)
|
||||
{
|
||||
return (unsigned long)(blkno & (u64)ULONG_MAX);
|
||||
}
|
||||
|
||||
static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb,
|
||||
u32 clusters)
|
||||
{
|
||||
int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits -
|
||||
sb->s_blocksize_bits;
|
||||
|
||||
return (u64)clusters << c_to_b_bits;
|
||||
}
|
||||
|
||||
static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb,
|
||||
u64 blocks)
|
||||
{
|
||||
int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits -
|
||||
sb->s_blocksize_bits;
|
||||
|
||||
return (u32)(blocks >> b_to_c_bits);
|
||||
}
|
||||
|
||||
static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
|
||||
u64 bytes)
|
||||
{
|
||||
int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
|
||||
unsigned int clusters;
|
||||
|
||||
bytes += OCFS2_SB(sb)->s_clustersize - 1;
|
||||
/* OCFS2 just cannot have enough clusters to overflow this */
|
||||
clusters = (unsigned int)(bytes >> cl_bits);
|
||||
|
||||
return clusters;
|
||||
}
|
||||
|
||||
/*
 * Number of blocks needed to hold 'bytes': round up to the next block
 * boundary, then shift down by s_blocksize_bits.
 */
static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
					 u64 bytes)
{
	u64 rounded = bytes + (sb->s_blocksize - 1);

	return rounded >> sb->s_blocksize_bits;
}
|
||||
|
||||
static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb,
|
||||
u32 clusters)
|
||||
{
|
||||
return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits;
|
||||
}
|
||||
|
||||
/*
 * Round 'bytes' up to a whole number of clusters and return the result
 * in bytes.
 */
static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb,
						u64 bytes)
{
	/* Widen the cluster count before shifting it back up to bytes. */
	u64 aligned = ocfs2_clusters_for_bytes(sb, bytes);

	return aligned << OCFS2_SB(sb)->s_clustersize_bits;
}
|
||||
|
||||
/*
 * Round 'bytes' up to a whole number of blocks and return the result
 * in bytes.
 */
static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb,
					      u64 bytes)
{
	return ocfs2_blocks_for_bytes(sb, bytes) << sb->s_blocksize_bits;
}
|
||||
|
||||
/*
 * Number of 512-byte units needed to hold 'bytes', rounded up.  The
 * 64-bit count is narrowed to unsigned long on return.
 */
static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
{
	u64 sectors = (bytes + 511) >> 9;

	return (unsigned long)sectors;
}
|
||||
|
||||
#define ocfs2_set_bit ext2_set_bit
|
||||
#define ocfs2_clear_bit ext2_clear_bit
|
||||
#define ocfs2_test_bit ext2_test_bit
|
||||
#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
|
||||
#endif /* OCFS2_H */
|
||||
|
638
fs/ocfs2/ocfs2_fs.h
Normal file
638
fs/ocfs2/ocfs2_fs.h
Normal file
@ -0,0 +1,638 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs2_fs.h
|
||||
*
|
||||
* On-disk structures for OCFS2.
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License, version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _OCFS2_FS_H
|
||||
#define _OCFS2_FS_H
|
||||
|
||||
/* Version */
|
||||
#define OCFS2_MAJOR_REV_LEVEL 0
|
||||
#define OCFS2_MINOR_REV_LEVEL 90
|
||||
|
||||
/*
|
||||
* An OCFS2 volume starts this way:
|
||||
* Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS.
|
||||
* Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS.
|
||||
* Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock.
|
||||
*
|
||||
* All other structures are found from the superblock information.
|
||||
*
|
||||
* OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. eg, for a
|
||||
* blocksize of 2K, it is 4096 bytes into disk.
|
||||
*/
|
||||
#define OCFS2_SUPER_BLOCK_BLKNO 2
|
||||
|
||||
/*
|
||||
* Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could
|
||||
* grow if needed.
|
||||
*/
|
||||
#define OCFS2_MIN_CLUSTERSIZE 4096
|
||||
#define OCFS2_MAX_CLUSTERSIZE 1048576
|
||||
|
||||
/*
|
||||
* Blocks cannot be bigger than clusters, so the maximum blocksize is the
|
||||
* minimum cluster size.
|
||||
*/
|
||||
#define OCFS2_MIN_BLOCKSIZE 512
|
||||
#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE
|
||||
|
||||
/* Filesystem magic number */
|
||||
#define OCFS2_SUPER_MAGIC 0x7461636f
|
||||
|
||||
/* Object signatures */
|
||||
#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2"
|
||||
#define OCFS2_INODE_SIGNATURE "INODE01"
|
||||
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
|
||||
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
|
||||
|
||||
/* Compatibility flags */
|
||||
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
|
||||
( OCFS2_SB(sb)->s_feature_compat & (mask) )
|
||||
#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \
|
||||
( OCFS2_SB(sb)->s_feature_ro_compat & (mask) )
|
||||
#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \
|
||||
( OCFS2_SB(sb)->s_feature_incompat & (mask) )
|
||||
#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \
|
||||
OCFS2_SB(sb)->s_feature_compat |= (mask)
|
||||
#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \
|
||||
OCFS2_SB(sb)->s_feature_ro_compat |= (mask)
|
||||
#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \
|
||||
OCFS2_SB(sb)->s_feature_incompat |= (mask)
|
||||
#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \
|
||||
OCFS2_SB(sb)->s_feature_compat &= ~(mask)
|
||||
#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \
|
||||
OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask)
|
||||
#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \
|
||||
OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
|
||||
|
||||
#define OCFS2_FEATURE_COMPAT_SUPP 0
|
||||
#define OCFS2_FEATURE_INCOMPAT_SUPP 0
|
||||
#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
|
||||
|
||||
/*
|
||||
* Heartbeat-only devices are missing journals and other files. The
|
||||
* filesystem driver can't load them, but the library can. Never put
|
||||
* this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*.
|
||||
*/
|
||||
#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002
|
||||
|
||||
|
||||
/*
|
||||
* Flags on ocfs2_dinode.i_flags
|
||||
*/
|
||||
#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */
|
||||
#define OCFS2_UNUSED2_FL (0x00000002)
|
||||
#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */
|
||||
#define OCFS2_UNUSED3_FL (0x00000008)
|
||||
/* System inode flags */
|
||||
#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */
|
||||
#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */
|
||||
#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */
|
||||
#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */
|
||||
#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */
|
||||
#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
|
||||
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
|
||||
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
|
||||
|
||||
/*
|
||||
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
|
||||
*/
|
||||
#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
|
||||
|
||||
/*
|
||||
* superblock s_state flags
|
||||
*/
|
||||
#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */
|
||||
|
||||
/* Limit of space in ocfs2_dir_entry */
|
||||
#define OCFS2_MAX_FILENAME_LEN 255
|
||||
|
||||
/* Maximum slots on an ocfs2 file system */
|
||||
#define OCFS2_MAX_SLOTS 255
|
||||
|
||||
/* Slot map indicator for an empty slot */
|
||||
#define OCFS2_INVALID_SLOT (-1)	/* parenthesized so it expands safely in any expression */
|
||||
|
||||
#define OCFS2_VOL_UUID_LEN 16
|
||||
#define OCFS2_MAX_VOL_LABEL_LEN 64
|
||||
|
||||
/* Journal limits (in bytes) */
|
||||
#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
|
||||
#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
|
||||
|
||||
struct ocfs2_system_inode_info {
|
||||
char *si_name;
|
||||
int si_iflags;
|
||||
int si_mode;
|
||||
};
|
||||
|
||||
/* System file index */
|
||||
enum {
|
||||
BAD_BLOCK_SYSTEM_INODE = 0,
|
||||
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
|
||||
SLOT_MAP_SYSTEM_INODE,
|
||||
#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
|
||||
HEARTBEAT_SYSTEM_INODE,
|
||||
GLOBAL_BITMAP_SYSTEM_INODE,
|
||||
#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
|
||||
ORPHAN_DIR_SYSTEM_INODE,
|
||||
EXTENT_ALLOC_SYSTEM_INODE,
|
||||
INODE_ALLOC_SYSTEM_INODE,
|
||||
JOURNAL_SYSTEM_INODE,
|
||||
LOCAL_ALLOC_SYSTEM_INODE,
|
||||
TRUNCATE_LOG_SYSTEM_INODE,
|
||||
NUM_SYSTEM_INODES
|
||||
};
|
||||
|
||||
static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
|
||||
/* Global system inodes (single copy) */
|
||||
/* The first two are only used from userspace mkfs/tunefs */
|
||||
[BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 },
|
||||
[GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
|
||||
|
||||
/* These are used by the running filesystem */
|
||||
[SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
|
||||
[HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
|
||||
[GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
|
||||
|
||||
/* Slot-specific system inodes (one copy per slot) */
|
||||
[ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
|
||||
[EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
|
||||
[INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
|
||||
[JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
|
||||
[LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
|
||||
[TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
|
||||
};
|
||||
|
||||
/* Parameter passed from mount.ocfs2 to module */
|
||||
#define OCFS2_HB_NONE "heartbeat=none"
|
||||
#define OCFS2_HB_LOCAL "heartbeat=local"
|
||||
|
||||
/*
|
||||
* OCFS2 directory file types. Only the low 3 bits are used. The
|
||||
* other bits are reserved for now.
|
||||
*/
|
||||
#define OCFS2_FT_UNKNOWN 0
|
||||
#define OCFS2_FT_REG_FILE 1
|
||||
#define OCFS2_FT_DIR 2
|
||||
#define OCFS2_FT_CHRDEV 3
|
||||
#define OCFS2_FT_BLKDEV 4
|
||||
#define OCFS2_FT_FIFO 5
|
||||
#define OCFS2_FT_SOCK 6
|
||||
#define OCFS2_FT_SYMLINK 7
|
||||
|
||||
#define OCFS2_FT_MAX 8
|
||||
|
||||
/*
|
||||
* OCFS2_DIR_PAD defines the directory entries boundaries
|
||||
*
|
||||
* NOTE: It must be a multiple of 4
|
||||
*/
|
||||
#define OCFS2_DIR_PAD 4
|
||||
#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1)
|
||||
#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name)
|
||||
#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \
|
||||
OCFS2_DIR_ROUND) & \
|
||||
~OCFS2_DIR_ROUND)
|
||||
|
||||
#define OCFS2_LINK_MAX 32000
|
||||
|
||||
#define S_SHIFT 12
|
||||
static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
|
||||
[S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE,
|
||||
[S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR,
|
||||
[S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV,
|
||||
[S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV,
|
||||
[S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO,
|
||||
[S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK,
|
||||
[S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK,
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Convenience casts
|
||||
*/
|
||||
#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
|
||||
|
||||
/*
|
||||
* On disk extent record for OCFS2
|
||||
* It describes a range of clusters on disk.
|
||||
*/
|
||||
struct ocfs2_extent_rec {
|
||||
/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */
|
||||
__le32 e_clusters; /* Clusters covered by this extent */
|
||||
__le64 e_blkno; /* Physical disk offset, in blocks */
|
||||
/*10*/
|
||||
};
|
||||
|
||||
struct ocfs2_chain_rec {
|
||||
__le32 c_free; /* Number of free bits in this chain. */
|
||||
__le32 c_total; /* Number of total bits in this chain */
|
||||
__le64 c_blkno; /* Physical disk offset (blocks) of 1st group */
|
||||
};
|
||||
|
||||
struct ocfs2_truncate_rec {
|
||||
__le32 t_start; /* 1st cluster in this log */
|
||||
__le32 t_clusters; /* Number of total clusters covered */
|
||||
};
|
||||
|
||||
/*
|
||||
* On disk extent list for OCFS2 (node in the tree). Note that this
|
||||
* is contained inside ocfs2_dinode or ocfs2_extent_block, so the
|
||||
* offsets are relative to ocfs2_dinode.id2.i_list or
|
||||
* ocfs2_extent_block.h_list, respectively.
|
||||
*/
|
||||
struct ocfs2_extent_list {
|
||||
/*00*/ __le16 l_tree_depth; /* Extent tree depth from this
|
||||
point. 0 means data extents
|
||||
hang directly off this
|
||||
header (a leaf) */
|
||||
__le16 l_count; /* Number of extent records */
|
||||
__le16 l_next_free_rec; /* Next unused extent slot */
|
||||
__le16 l_reserved1;
|
||||
__le64 l_reserved2; /* Pad to
|
||||
sizeof(ocfs2_extent_rec) */
|
||||
/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */
|
||||
};
|
||||
|
||||
/*
|
||||
* On disk allocation chain list for OCFS2. Note that this is
|
||||
* contained inside ocfs2_dinode, so the offsets are relative to
|
||||
* ocfs2_dinode.id2.i_chain.
|
||||
*/
|
||||
struct ocfs2_chain_list {
|
||||
/*00*/ __le16 cl_cpg; /* Clusters per Block Group */
|
||||
__le16 cl_bpc; /* Bits per cluster */
|
||||
__le16 cl_count; /* Total chains in this list */
|
||||
__le16 cl_next_free_rec; /* Next unused chain slot */
|
||||
__le64 cl_reserved1;
|
||||
/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */
|
||||
};
|
||||
|
||||
/*
|
||||
* On disk deallocation log for OCFS2. Note that this is
|
||||
* contained inside ocfs2_dinode, so the offsets are relative to
|
||||
* ocfs2_dinode.id2.i_dealloc.
|
||||
*/
|
||||
struct ocfs2_truncate_log {
|
||||
/*00*/ __le16 tl_count; /* Total records in this log */
|
||||
__le16 tl_used; /* Number of records in use */
|
||||
__le32 tl_reserved1;
|
||||
/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */
|
||||
};
|
||||
|
||||
/*
|
||||
* On disk extent block (indirect block) for OCFS2
|
||||
*/
|
||||
struct ocfs2_extent_block
|
||||
{
|
||||
/*00*/ __u8 h_signature[8]; /* Signature for verification */
|
||||
__le64 h_reserved1;
|
||||
/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
|
||||
extent_header belongs to */
|
||||
__le16 h_suballoc_bit; /* Bit offset in suballocator
|
||||
block group */
|
||||
__le32 h_fs_generation; /* Must match super block */
|
||||
__le64 h_blkno; /* Offset on disk, in blocks */
|
||||
/*20*/ __le64 h_reserved3;
|
||||
__le64 h_next_leaf_blk; /* Offset on disk, in blocks,
|
||||
of next leaf header pointing
|
||||
to data */
|
||||
/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */
|
||||
/* Actual on-disk size is one block */
|
||||
};
|
||||
|
||||
/*
|
||||
* On disk superblock for OCFS2
|
||||
* Note that it is contained inside an ocfs2_dinode, so all offsets
|
||||
* are relative to the start of ocfs2_dinode.id2.
|
||||
*/
|
||||
struct ocfs2_super_block {
|
||||
/*00*/ __le16 s_major_rev_level;
|
||||
__le16 s_minor_rev_level;
|
||||
__le16 s_mnt_count;
|
||||
__le16 s_max_mnt_count;
|
||||
__le16 s_state; /* File system state */
|
||||
__le16 s_errors; /* Behaviour when detecting errors */
|
||||
__le32 s_checkinterval; /* Max time between checks */
|
||||
/*10*/ __le64 s_lastcheck; /* Time of last check */
|
||||
__le32 s_creator_os; /* OS */
|
||||
__le32 s_feature_compat; /* Compatible feature set */
|
||||
/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */
|
||||
__le32 s_feature_ro_compat; /* Readonly-compatible feature set */
|
||||
__le64 s_root_blkno; /* Offset, in blocks, of root directory
|
||||
dinode */
|
||||
/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system
|
||||
directory dinode */
|
||||
__le32 s_blocksize_bits; /* Blocksize for this fs */
|
||||
__le32 s_clustersize_bits; /* Clustersize for this fs */
|
||||
/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
|
||||
before tunefs required */
|
||||
__le16 s_reserved1;
|
||||
__le32 s_reserved2;
|
||||
__le64 s_first_cluster_group; /* Block offset of 1st cluster
|
||||
* group header */
|
||||
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
|
||||
/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
|
||||
/*A0*/
|
||||
};
|
||||
|
||||
/*
|
||||
* Local allocation bitmap for OCFS2 slots
|
||||
* Note that it exists inside an ocfs2_dinode, so all offsets are
|
||||
* relative to the start of ocfs2_dinode.id2.
|
||||
*/
|
||||
struct ocfs2_local_alloc
|
||||
{
|
||||
/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */
|
||||
__le16 la_size; /* Size of included bitmap, in bytes */
|
||||
__le16 la_reserved1;
|
||||
__le64 la_reserved2;
|
||||
/*10*/ __u8 la_bitmap[0];
|
||||
};
|
||||
|
||||
/*
|
||||
* On disk inode for OCFS2
|
||||
*/
|
||||
struct ocfs2_dinode {
|
||||
/*00*/ __u8 i_signature[8]; /* Signature for validation */
|
||||
__le32 i_generation; /* Generation number */
|
||||
__le16 i_suballoc_slot; /* Slot suballocator this inode
|
||||
belongs to */
|
||||
__le16 i_suballoc_bit; /* Bit offset in suballocator
|
||||
block group */
|
||||
/*10*/ __le32 i_reserved0;
|
||||
__le32 i_clusters; /* Cluster count */
|
||||
__le32 i_uid; /* Owner UID */
|
||||
__le32 i_gid; /* Owning GID */
|
||||
/*20*/ __le64 i_size; /* Size in bytes */
|
||||
__le16 i_mode; /* File mode */
|
||||
__le16 i_links_count; /* Links count */
|
||||
__le32 i_flags; /* File flags */
|
||||
/*30*/ __le64 i_atime; /* Access time */
|
||||
__le64 i_ctime; /* Creation time */
|
||||
/*40*/ __le64 i_mtime; /* Modification time */
|
||||
__le64 i_dtime; /* Deletion time */
|
||||
/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */
|
||||
__le64 i_last_eb_blk; /* Pointer to last extent
|
||||
block */
|
||||
/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */
|
||||
__le32 i_atime_nsec;
|
||||
__le32 i_ctime_nsec;
|
||||
__le32 i_mtime_nsec;
|
||||
/*70*/ __le64 i_reserved1[9];
|
||||
/*B8*/ union {
|
||||
__le64 i_pad1; /* Generic way to refer to this
|
||||
64bit union */
|
||||
struct {
|
||||
__le64 i_rdev; /* Device number */
|
||||
} dev1;
|
||||
struct { /* Info for bitmap system
|
||||
inodes */
|
||||
__le32 i_used; /* Bits (ie, clusters) used */
|
||||
__le32 i_total; /* Total bits (clusters)
|
||||
available */
|
||||
} bitmap1;
|
||||
struct { /* Info for journal system
|
||||
inodes */
|
||||
__le32 ij_flags; /* Mounted, version, etc. */
|
||||
__le32 ij_pad;
|
||||
} journal1;
|
||||
} id1; /* Inode type dependant 1 */
|
||||
/*C0*/ union {
|
||||
struct ocfs2_super_block i_super;
|
||||
struct ocfs2_local_alloc i_lab;
|
||||
struct ocfs2_chain_list i_chain;
|
||||
struct ocfs2_extent_list i_list;
|
||||
struct ocfs2_truncate_log i_dealloc;
|
||||
__u8 i_symlink[0];
|
||||
} id2;
|
||||
/* Actual on-disk size is one block */
|
||||
};
|
||||
|
||||
/*
|
||||
* On-disk directory entry structure for OCFS2
|
||||
*
|
||||
* Packed as this structure could be accessed unaligned on 64-bit platforms
|
||||
*/
|
||||
struct ocfs2_dir_entry {
|
||||
/*00*/ __le64 inode; /* Inode number */
|
||||
__le16 rec_len; /* Directory entry length */
|
||||
__u8 name_len; /* Name length */
|
||||
__u8 file_type;
|
||||
/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */
|
||||
/* Actual on-disk length specified by rec_len */
|
||||
} __attribute__ ((packed));
|
||||
|
||||
/*
|
||||
* On disk allocator group structure for OCFS2
|
||||
*/
|
||||
struct ocfs2_group_desc
|
||||
{
|
||||
/*00*/ __u8 bg_signature[8]; /* Signature for validation */
|
||||
__le16 bg_size; /* Size of included bitmap in
|
||||
bytes. */
|
||||
__le16 bg_bits; /* Bits represented by this
|
||||
group. */
|
||||
__le16 bg_free_bits_count; /* Free bits count */
|
||||
__le16 bg_chain; /* What chain I am in. */
|
||||
/*10*/ __le32 bg_generation;
|
||||
__le32 bg_reserved1;
|
||||
__le64 bg_next_group; /* Next group in my list, in
|
||||
blocks */
|
||||
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
|
||||
blocks */
|
||||
__le64 bg_blkno; /* Offset on disk, in blocks */
|
||||
/*30*/ __le64 bg_reserved2[2];
|
||||
/*40*/ __u8 bg_bitmap[0];
|
||||
};
|
||||
|
||||
#ifdef __KERNEL__
|
||||
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
|
||||
{
|
||||
return sb->s_blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_symlink);
|
||||
}
|
||||
|
||||
static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = sb->s_blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_extent_rec);
|
||||
}
|
||||
|
||||
static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = sb->s_blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_chain_rec);
|
||||
}
|
||||
|
||||
static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = sb->s_blocksize -
|
||||
offsetof(struct ocfs2_extent_block, h_list.l_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_extent_rec);
|
||||
}
|
||||
|
||||
static inline u16 ocfs2_local_alloc_size(struct super_block *sb)
|
||||
{
|
||||
u16 size;
|
||||
|
||||
size = sb->s_blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
static inline int ocfs2_group_bitmap_size(struct super_block *sb)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = sb->s_blocksize -
|
||||
offsetof(struct ocfs2_group_desc, bg_bitmap);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = sb->s_blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_truncate_rec);
|
||||
}
|
||||
#else
|
||||
static inline int ocfs2_fast_symlink_chars(int blocksize)
|
||||
{
|
||||
return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
|
||||
}
|
||||
|
||||
static inline int ocfs2_extent_recs_per_inode(int blocksize)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_extent_rec);
|
||||
}
|
||||
|
||||
static inline int ocfs2_chain_recs_per_inode(int blocksize)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_chain_rec);
|
||||
}
|
||||
|
||||
static inline int ocfs2_extent_recs_per_eb(int blocksize)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = blocksize -
|
||||
offsetof(struct ocfs2_extent_block, h_list.l_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_extent_rec);
|
||||
}
|
||||
|
||||
static inline int ocfs2_local_alloc_size(int blocksize)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
static inline int ocfs2_group_bitmap_size(int blocksize)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = blocksize -
|
||||
offsetof(struct ocfs2_group_desc, bg_bitmap);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
static inline int ocfs2_truncate_recs_per_inode(int blocksize)
|
||||
{
|
||||
int size;
|
||||
|
||||
size = blocksize -
|
||||
offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs);
|
||||
|
||||
return size / sizeof(struct ocfs2_truncate_rec);
|
||||
}
|
||||
#endif /* __KERNEL__ */
|
||||
|
||||
|
||||
static inline int ocfs2_system_inode_is_global(int type)
|
||||
{
|
||||
return ((type >= 0) &&
|
||||
(type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE));
|
||||
}
|
||||
|
||||
static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
|
||||
int type, int slot)
|
||||
{
|
||||
int chars;
|
||||
|
||||
/*
|
||||
* Global system inodes can only have one copy. Everything
|
||||
* after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode
|
||||
* list has a copy per slot.
|
||||
*/
|
||||
if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
|
||||
chars = snprintf(buf, len,
|
||||
ocfs2_system_inodes[type].si_name);
|
||||
else
|
||||
chars = snprintf(buf, len,
|
||||
ocfs2_system_inodes[type].si_name,
|
||||
slot);
|
||||
|
||||
return chars;
|
||||
}
|
||||
|
||||
static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de,
|
||||
umode_t mode)
|
||||
{
|
||||
de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
|
||||
}
|
||||
|
||||
#endif /* _OCFS2_FS_H */
|
||||
|
73
fs/ocfs2/ocfs2_lockid.h
Normal file
73
fs/ocfs2/ocfs2_lockid.h
Normal file
@ -0,0 +1,73 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* ocfs2_lockid.h
|
||||
*
|
||||
* Defines OCFS2 lockid bits.
|
||||
*
|
||||
* Copyright (C) 2002, 2005 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 021110-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef OCFS2_LOCKID_H
|
||||
#define OCFS2_LOCKID_H
|
||||
|
||||
/* lock ids are made up in the following manner:
|
||||
* name[0] --> type
|
||||
* name[1-6] --> 6 pad characters, reserved for now
|
||||
* name[7-22] --> block number, expressed in hex as 16 chars
|
||||
* name[23-30] --> i_generation, expressed in hex 8 chars
|
||||
* name[31] --> '\0' */
|
||||
#define OCFS2_LOCK_ID_MAX_LEN 32
|
||||
#define OCFS2_LOCK_ID_PAD "000000"
|
||||
|
||||
/*
 * Lock types that can be encoded in an OCFS2 lock id.
 * OCFS2_NUM_LOCK_TYPES is the enumerator count, not a real lock type.
 */
enum ocfs2_lock_type {
	OCFS2_LOCK_TYPE_META = 0,
	OCFS2_LOCK_TYPE_DATA,
	OCFS2_LOCK_TYPE_SUPER,
	OCFS2_LOCK_TYPE_RENAME,
	OCFS2_LOCK_TYPE_RW,
	OCFS2_NUM_LOCK_TYPES
};

/*
 * Translate a lock type into its one-character tag.
 *
 * Returns 'M', 'D', 'S', 'R' or 'W' for the five real lock types and
 * '\0' for every other value (OCFS2_NUM_LOCK_TYPES included).
 */
static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
{
	if (type == OCFS2_LOCK_TYPE_META)
		return 'M';
	if (type == OCFS2_LOCK_TYPE_DATA)
		return 'D';
	if (type == OCFS2_LOCK_TYPE_SUPER)
		return 'S';
	if (type == OCFS2_LOCK_TYPE_RENAME)
		return 'R';
	if (type == OCFS2_LOCK_TYPE_RW)
		return 'W';

	/* no tag is assigned to anything else */
	return '\0';
}
|
||||
|
||||
#endif /* OCFS2_LOCKID_H */
|
303
fs/ocfs2/slot_map.c
Normal file
303
fs/ocfs2/slot_map.c
Normal file
@ -0,0 +1,303 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* slot_map.c
|
||||
*
|
||||
*
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/smp_lock.h>
|
||||
|
||||
#define MLOG_MASK_PREFIX ML_SUPER
|
||||
#include <cluster/masklog.h>
|
||||
|
||||
#include "ocfs2.h"
|
||||
|
||||
#include "dlmglue.h"
|
||||
#include "extent_map.h"
|
||||
#include "heartbeat.h"
|
||||
#include "inode.h"
|
||||
#include "slot_map.h"
|
||||
#include "super.h"
|
||||
#include "sysfile.h"
|
||||
|
||||
#include "buffer_head_io.h"
|
||||
|
||||
static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
|
||||
s16 global);
|
||||
static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
|
||||
s16 slot_num,
|
||||
s16 node_num);
|
||||
|
||||
/* Use the slot information we've collected to create a map of mounted
|
||||
* nodes. Should be holding an EX on super block. assumes slot info is
|
||||
* up to date. Note that we call this *after* we find a slot, so our
|
||||
* own node should be set in the map too... */
|
||||
void ocfs2_populate_mounted_map(struct ocfs2_super *osb)
|
||||
{
|
||||
int i;
|
||||
struct ocfs2_slot_info *si = osb->slot_info;
|
||||
|
||||
spin_lock(&si->si_lock);
|
||||
|
||||
for (i = 0; i < si->si_size; i++)
|
||||
if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT)
|
||||
ocfs2_node_map_set_bit(osb, &osb->mounted_map,
|
||||
si->si_global_node_nums[i]);
|
||||
|
||||
spin_unlock(&si->si_lock);
|
||||
}
|
||||
|
||||
/* post the slot information on disk into our slot_info struct. */
|
||||
void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
|
||||
{
|
||||
int i;
|
||||
__le16 *disk_info;
|
||||
|
||||
/* we don't read the slot block here as ocfs2_super_lock
|
||||
* should've made sure we have the most recent copy. */
|
||||
spin_lock(&si->si_lock);
|
||||
disk_info = (__le16 *) si->si_bh->b_data;
|
||||
|
||||
for (i = 0; i < si->si_size; i++)
|
||||
si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]);
|
||||
|
||||
spin_unlock(&si->si_lock);
|
||||
}
|
||||
|
||||
/* post the our slot info stuff into it's destination bh and write it
|
||||
* out. */
|
||||
int ocfs2_update_disk_slots(struct ocfs2_super *osb,
|
||||
struct ocfs2_slot_info *si)
|
||||
{
|
||||
int status, i;
|
||||
__le16 *disk_info = (__le16 *) si->si_bh->b_data;
|
||||
|
||||
spin_lock(&si->si_lock);
|
||||
for (i = 0; i < si->si_size; i++)
|
||||
disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]);
|
||||
spin_unlock(&si->si_lock);
|
||||
|
||||
status = ocfs2_write_block(osb, si->si_bh, si->si_inode);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
/* try to find global node in the slot info. Returns
|
||||
* OCFS2_INVALID_SLOT if nothing is found. */
|
||||
static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
|
||||
s16 global)
|
||||
{
|
||||
int i;
|
||||
s16 ret = OCFS2_INVALID_SLOT;
|
||||
|
||||
for(i = 0; i < si->si_num_slots; i++) {
|
||||
if (global == si->si_global_node_nums[i]) {
|
||||
ret = (s16) i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
|
||||
{
|
||||
int i;
|
||||
s16 ret = OCFS2_INVALID_SLOT;
|
||||
|
||||
for(i = 0; i < si->si_num_slots; i++) {
|
||||
if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
|
||||
ret = (s16) i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Locked wrapper around __ocfs2_node_num_to_slot(): returns the slot
 * holding the given global node number, or OCFS2_INVALID_SLOT.
 */
s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
			   s16 global)
{
	s16 slot;

	spin_lock(&si->si_lock);
	slot = __ocfs2_node_num_to_slot(si, global);
	spin_unlock(&si->si_lock);

	return slot;
}
|
||||
|
||||
static void __ocfs2_fill_slot(struct ocfs2_slot_info *si,
|
||||
s16 slot_num,
|
||||
s16 node_num)
|
||||
{
|
||||
BUG_ON(slot_num == OCFS2_INVALID_SLOT);
|
||||
BUG_ON(slot_num >= si->si_num_slots);
|
||||
BUG_ON((node_num != O2NM_INVALID_NODE_NUM) &&
|
||||
(node_num >= O2NM_MAX_NODES));
|
||||
|
||||
si->si_global_node_nums[slot_num] = node_num;
|
||||
}
|
||||
|
||||
void ocfs2_clear_slot(struct ocfs2_slot_info *si,
|
||||
s16 slot_num)
|
||||
{
|
||||
spin_lock(&si->si_lock);
|
||||
__ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT);
|
||||
spin_unlock(&si->si_lock);
|
||||
}
|
||||
|
||||
int ocfs2_init_slot_info(struct ocfs2_super *osb)
|
||||
{
|
||||
int status, i;
|
||||
u64 blkno;
|
||||
struct inode *inode = NULL;
|
||||
struct buffer_head *bh = NULL;
|
||||
struct ocfs2_slot_info *si;
|
||||
|
||||
si = kcalloc(1, sizeof(struct ocfs2_slot_info), GFP_KERNEL);
|
||||
if (!si) {
|
||||
status = -ENOMEM;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
spin_lock_init(&si->si_lock);
|
||||
si->si_num_slots = osb->max_slots;
|
||||
si->si_size = OCFS2_MAX_SLOTS;
|
||||
|
||||
for(i = 0; i < si->si_num_slots; i++)
|
||||
si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
|
||||
|
||||
inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
|
||||
OCFS2_INVALID_SLOT);
|
||||
if (!inode) {
|
||||
status = -EINVAL;
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
si->si_inode = inode;
|
||||
si->si_bh = bh;
|
||||
osb->slot_info = si;
|
||||
bail:
|
||||
if (status < 0 && si)
|
||||
ocfs2_free_slot_info(si);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
void ocfs2_free_slot_info(struct ocfs2_slot_info *si)
|
||||
{
|
||||
if (si->si_inode)
|
||||
iput(si->si_inode);
|
||||
if (si->si_bh)
|
||||
brelse(si->si_bh);
|
||||
kfree(si);
|
||||
}
|
||||
|
||||
int ocfs2_find_slot(struct ocfs2_super *osb)
|
||||
{
|
||||
int status;
|
||||
s16 slot;
|
||||
struct ocfs2_slot_info *si;
|
||||
|
||||
mlog_entry_void();
|
||||
|
||||
si = osb->slot_info;
|
||||
|
||||
ocfs2_update_slot_info(si);
|
||||
|
||||
spin_lock(&si->si_lock);
|
||||
/* search for ourselves first and take the slot if it already
|
||||
* exists. Perhaps we need to mark this in a variable for our
|
||||
* own journal recovery? Possibly not, though we certainly
|
||||
* need to warn to the user */
|
||||
slot = __ocfs2_node_num_to_slot(si, osb->node_num);
|
||||
if (slot == OCFS2_INVALID_SLOT) {
|
||||
/* if no slot yet, then just take 1st available
|
||||
* one. */
|
||||
slot = __ocfs2_find_empty_slot(si);
|
||||
if (slot == OCFS2_INVALID_SLOT) {
|
||||
spin_unlock(&si->si_lock);
|
||||
mlog(ML_ERROR, "no free slots available!\n");
|
||||
status = -EINVAL;
|
||||
goto bail;
|
||||
}
|
||||
} else
|
||||
mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
|
||||
slot);
|
||||
|
||||
__ocfs2_fill_slot(si, slot, osb->node_num);
|
||||
osb->slot_num = slot;
|
||||
spin_unlock(&si->si_lock);
|
||||
|
||||
mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
|
||||
|
||||
status = ocfs2_update_disk_slots(osb, si);
|
||||
if (status < 0)
|
||||
mlog_errno(status);
|
||||
|
||||
bail:
|
||||
mlog_exit(status);
|
||||
return status;
|
||||
}
|
||||
|
||||
void ocfs2_put_slot(struct ocfs2_super *osb)
|
||||
{
|
||||
int status;
|
||||
struct ocfs2_slot_info *si = osb->slot_info;
|
||||
|
||||
if (!si)
|
||||
return;
|
||||
|
||||
ocfs2_update_slot_info(si);
|
||||
|
||||
spin_lock(&si->si_lock);
|
||||
__ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT);
|
||||
osb->slot_num = OCFS2_INVALID_SLOT;
|
||||
spin_unlock(&si->si_lock);
|
||||
|
||||
status = ocfs2_update_disk_slots(osb, si);
|
||||
if (status < 0) {
|
||||
mlog_errno(status);
|
||||
goto bail;
|
||||
}
|
||||
|
||||
bail:
|
||||
osb->slot_info = NULL;
|
||||
ocfs2_free_slot_info(si);
|
||||
}
|
||||
|
66
fs/ocfs2/slot_map.h
Normal file
66
fs/ocfs2/slot_map.h
Normal file
@ -0,0 +1,66 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* slot_map.h
|
||||
*
|
||||
* In-memory slot map structure and the functions that operate on it
|
||||
*
|
||||
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef SLOTMAP_H
|
||||
#define SLOTMAP_H
|
||||
|
||||
struct ocfs2_slot_info {
|
||||
spinlock_t si_lock;
|
||||
|
||||
struct inode *si_inode;
|
||||
struct buffer_head *si_bh;
|
||||
unsigned int si_num_slots;
|
||||
unsigned int si_size;
|
||||
s16 si_global_node_nums[OCFS2_MAX_SLOTS];
|
||||
};
|
||||
|
||||
int ocfs2_init_slot_info(struct ocfs2_super *osb);
|
||||
void ocfs2_free_slot_info(struct ocfs2_slot_info *si);
|
||||
|
||||
int ocfs2_find_slot(struct ocfs2_super *osb);
|
||||
void ocfs2_put_slot(struct ocfs2_super *osb);
|
||||
|
||||
void ocfs2_update_slot_info(struct ocfs2_slot_info *si);
|
||||
int ocfs2_update_disk_slots(struct ocfs2_super *osb,
|
||||
struct ocfs2_slot_info *si);
|
||||
|
||||
s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
|
||||
s16 global);
|
||||
void ocfs2_clear_slot(struct ocfs2_slot_info *si,
|
||||
s16 slot_num);
|
||||
|
||||
void ocfs2_populate_mounted_map(struct ocfs2_super *osb);
|
||||
|
||||
static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si,
|
||||
int slot_num)
|
||||
{
|
||||
BUG_ON(slot_num == OCFS2_INVALID_SLOT);
|
||||
assert_spin_locked(&si->si_lock);
|
||||
|
||||
return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT;
|
||||
}
|
||||
|
||||
#endif
|
1651
fs/ocfs2/suballoc.c
Normal file
1651
fs/ocfs2/suballoc.c
Normal file
File diff suppressed because it is too large
Load Diff
132
fs/ocfs2/suballoc.h
Normal file
132
fs/ocfs2/suballoc.h
Normal file
@ -0,0 +1,132 @@
|
||||
/* -*- mode: c; c-basic-offset: 8; -*-
|
||||
* vim: noexpandtab sw=8 ts=8 sts=0:
|
||||
*
|
||||
* suballoc.h
|
||||
*
|
||||
* Defines sub allocator api
|
||||
*
|
||||
* Copyright (C) 2003, 2004 Oracle. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public
|
||||
* License along with this program; if not, write to the
|
||||
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
* Boston, MA 02111-1307, USA.
|
||||
*/
|
||||
|
||||
#ifndef _CHAINALLOC_H_
|
||||
#define _CHAINALLOC_H_
|
||||
|
||||
typedef int (group_search_t)(struct inode *,
|
||||
struct buffer_head *,
|
||||
u32,
|
||||
u32,
|
||||
u16 *,
|
||||
u16 *);
|
||||
|
||||
struct ocfs2_alloc_context {
|
||||
struct inode *ac_inode; /* which bitmap are we allocating from? */
|
||||
struct buffer_head *ac_bh; /* file entry bh */
|
||||
u32 ac_bits_wanted;
|
||||
u32 ac_bits_given;
|
||||
#define OCFS2_AC_USE_LOCAL 1
|
||||
#define OCFS2_AC_USE_MAIN 2
|
||||
#define OCFS2_AC_USE_INODE 3
|
||||
#define OCFS2_AC_USE_META 4
|
||||
u32 ac_which;
|
||||
struct ocfs2_journal_handle *ac_handle;
|
||||
|
||||
/* these are used by the chain search */
|
||||
u16 ac_chain;
|
||||
int ac_allow_chain_relink;
|
||||
group_search_t *ac_group_search;
|
||||
};
|
||||
|
||||
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
|
||||
static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
|
||||
{
|
||||
return ac->ac_bits_wanted - ac->ac_bits_given;
|
||||
}
|
||||
|
||||
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_dinode *fe,
|
||||
struct ocfs2_alloc_context **ac);
|
||||
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context **ac);
|
||||
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
u32 bits_wanted,
|
||||
struct ocfs2_alloc_context **ac);
|
||||
|
||||
int ocfs2_claim_metadata(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *ac,
|
||||
u32 bits_wanted,
|
||||
u16 *suballoc_bit_start,
|
||||
u32 *num_bits,
|
||||
u64 *blkno_start);
|
||||
int ocfs2_claim_new_inode(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *ac,
|
||||
u16 *suballoc_bit,
|
||||
u64 *fe_blkno);
|
||||
int ocfs2_claim_clusters(struct ocfs2_super *osb,
|
||||
struct ocfs2_journal_handle *handle,
|
||||
struct ocfs2_alloc_context *ac,
|
||||
u32 min_clusters,
|
||||
u32 *cluster_start,
|
||||
u32 *num_clusters);
|
||||
|
||||
int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
|
||||
struct inode *inode_alloc_inode,
|
||||
struct buffer_head *inode_alloc_bh,
|
||||
struct ocfs2_dinode *di);
|
||||
int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
|
||||
struct inode *eb_alloc_inode,
|
||||
struct buffer_head *eb_alloc_bh,
|
||||
struct ocfs2_extent_block *eb);
|
||||
int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
|
||||
struct inode *bitmap_inode,
|
||||
struct buffer_head *bitmap_bh,
|
||||
u64 start_blk,
|
||||
unsigned int num_clusters);
|
||||
|
||||
static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
|
||||
u64 bg_blkno)
|
||||
{
|
||||
/* This should work for all block group descriptors as only
|
||||
* the 1st group descriptor of the cluster bitmap is
|
||||
* different. */
|
||||
|
||||
if (bg_blkno == osb->first_cluster_group_blkno)
|
||||
return 0;
|
||||
|
||||
/* the rest of the block groups are located at the beginning
|
||||
* of their 1st cluster, so a direct translation just
|
||||
* works. */
|
||||
return ocfs2_blocks_to_clusters(osb->sb, bg_blkno);
|
||||
}
|
||||
|
||||
static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
|
||||
{
|
||||
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
|
||||
return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno;
|
||||
}
|
||||
|
||||
/* This is for local alloc ONLY. Others should use the task-specific
|
||||
* apis above. */
|
||||
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
|
||||
struct ocfs2_alloc_context *ac);
|
||||
|
||||
#endif /* _CHAINALLOC_H_ */
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user