dlm: allow dlm do recovery during shutdown
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / fs / ocfs2 / dlmglue.c
CommitLineData
ccd979bd
MF
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmglue.c
5 *
6 * Code which implements an OCFS2 specific interface to our DLM.
7 *
8 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 */
25
26#include <linux/types.h>
27#include <linux/slab.h>
28#include <linux/highmem.h>
29#include <linux/mm.h>
ccd979bd
MF
30#include <linux/kthread.h>
31#include <linux/pagemap.h>
32#include <linux/debugfs.h>
33#include <linux/seq_file.h>
8ddb7b00 34#include <linux/time.h>
9e33d69f 35#include <linux/quotaops.h>
ccd979bd 36
ccd979bd
MF
37#define MLOG_MASK_PREFIX ML_DLM_GLUE
38#include <cluster/masklog.h>
39
40#include "ocfs2.h"
d24fbcda 41#include "ocfs2_lockingver.h"
ccd979bd
MF
42
43#include "alloc.h"
d680efe9 44#include "dcache.h"
ccd979bd
MF
45#include "dlmglue.h"
46#include "extent_map.h"
7f1a37e3 47#include "file.h"
ccd979bd
MF
48#include "heartbeat.h"
49#include "inode.h"
50#include "journal.h"
24ef1815 51#include "stackglue.h"
ccd979bd
MF
52#include "slot_map.h"
53#include "super.h"
54#include "uptodate.h"
9e33d69f 55#include "quota.h"
8dec98ed 56#include "refcounttree.h"
ccd979bd
MF
57
58#include "buffer_head_io.h"
59
60struct ocfs2_mask_waiter {
61 struct list_head mw_item;
62 int mw_status;
63 struct completion mw_complete;
64 unsigned long mw_mask;
65 unsigned long mw_goal;
8ddb7b00
SM
66#ifdef CONFIG_OCFS2_FS_STATS
67 unsigned long long mw_lock_start;
68#endif
ccd979bd
MF
69};
70
54a7e755
MF
71static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
72static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
cf8e06f1 73static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
9e33d69f 74static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
ccd979bd 75
d680efe9 76/*
cc567d89 77 * Return value from ->downconvert_worker functions.
d680efe9 78 *
b5e500e2 79 * These control the precise actions of ocfs2_unblock_lock()
d680efe9
MF
80 * and ocfs2_process_blocked_lock()
81 *
82 */
83enum ocfs2_unblock_action {
84 UNBLOCK_CONTINUE = 0, /* Continue downconvert */
85 UNBLOCK_CONTINUE_POST = 1, /* Continue downconvert, fire
86 * ->post_unlock callback */
87 UNBLOCK_STOP_POST = 2, /* Do not downconvert, fire
88 * ->post_unlock() callback. */
89};
90
91struct ocfs2_unblock_ctl {
92 int requeue;
93 enum ocfs2_unblock_action unblock_action;
94};
95
cb25797d
JK
96/* Lockdep class keys */
97struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
98
810d5aeb
MF
99static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
100 int new_level);
101static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres);
102
cc567d89
MF
103static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
104 int blocking);
105
cc567d89
MF
106static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
107 int blocking);
d680efe9
MF
108
109static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
110 struct ocfs2_lock_res *lockres);
ccd979bd 111
9e33d69f 112static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
6cb129f5 113
8dec98ed
TM
114static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
115 int new_level);
116static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
117 int blocking);
118
6cb129f5
AB
119#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
120
121/* This aids in debugging situations where a bad LVB might be involved. */
122static void ocfs2_dump_meta_lvb_info(u64 level,
123 const char *function,
124 unsigned int line,
125 struct ocfs2_lock_res *lockres)
126{
a641dc2a 127 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
6cb129f5
AB
128
129 mlog(level, "LVB information for %s (called from %s:%u):\n",
130 lockres->l_name, function, line);
131 mlog(level, "version: %u, clusters: %u, generation: 0x%x\n",
132 lvb->lvb_version, be32_to_cpu(lvb->lvb_iclusters),
133 be32_to_cpu(lvb->lvb_igeneration));
134 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
135 (unsigned long long)be64_to_cpu(lvb->lvb_isize),
136 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
137 be16_to_cpu(lvb->lvb_imode));
138 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
139 "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
140 (long long)be64_to_cpu(lvb->lvb_iatime_packed),
141 (long long)be64_to_cpu(lvb->lvb_ictime_packed),
142 (long long)be64_to_cpu(lvb->lvb_imtime_packed),
143 be32_to_cpu(lvb->lvb_iattr));
144}
145
146
/*
 * OCFS2 Lock Resource Operations
 *
 * These fine tune the behavior of the generic dlmglue locking infrastructure.
 *
 * The most basic of lock types can point ->l_priv to their respective
 * struct ocfs2_super and allow the default actions to manage things.
 *
 * Right now, each lock type also needs to implement an init function,
 * and trivial lock/unlock wrappers. ocfs2_simple_drop_lockres()
 * should be called when the lock is no longer needed (i.e., object
 * destruction time).
 */
struct ocfs2_lock_res_ops {
	/*
	 * Translate an ocfs2_lock_res * into an ocfs2_super *. Define
	 * this callback if ->l_priv is not an ocfs2_super pointer
	 */
	struct ocfs2_super * (*get_osb)(struct ocfs2_lock_res *);

	/*
	 * Optionally called in the downconvert thread after a
	 * successful downconvert. The lockres will not be referenced
	 * after this callback is called, so it is safe to free
	 * memory, etc.
	 *
	 * The exact semantics of when this is called are controlled
	 * by ->downconvert_worker()
	 */
	void (*post_unlock)(struct ocfs2_super *, struct ocfs2_lock_res *);

	/*
	 * Allow a lock type to add checks to determine whether it is
	 * safe to downconvert a lock. Return 0 to re-queue the
	 * downconvert at a later time, nonzero to continue.
	 *
	 * For most locks, the default checks that there are no
	 * incompatible holders are sufficient.
	 *
	 * Called with the lockres spinlock held.
	 */
	int (*check_downconvert)(struct ocfs2_lock_res *, int);

	/*
	 * Allows a lock type to populate the lock value block. This
	 * is called on downconvert, and when we drop a lock.
	 *
	 * Locks that want to use this should set LOCK_TYPE_USES_LVB
	 * in the flags field.
	 *
	 * Called with the lockres spinlock held.
	 */
	void (*set_lvb)(struct ocfs2_lock_res *);

	/*
	 * Called from the downconvert thread when it is determined
	 * that a lock will be downconverted. This is called without
	 * any locks held so the function can do work that might
	 * schedule (syncing out data, etc).
	 *
	 * This should return any one of the ocfs2_unblock_action
	 * values, depending on what it wants the thread to do.
	 */
	int (*downconvert_worker)(struct ocfs2_lock_res *, int);

	/*
	 * LOCK_TYPE_* flags which describe the specific requirements
	 * of a lock type. Descriptions of each individual flag follow.
	 */
	int flags;
};

/*
 * Some locks want to "refresh" potentially stale data when a
 * meaningful (PRMODE or EXMODE) lock level is first obtained. If this
 * flag is set, the OCFS2_LOCK_NEEDS_REFRESH flag will be set on the
 * individual lockres l_flags member from the ast function. It is
 * expected that the locking wrapper will clear the
 * OCFS2_LOCK_NEEDS_REFRESH flag when done.
 */
#define LOCK_TYPE_REQUIRES_REFRESH 0x1

/*
 * Indicate that a lock type makes use of the lock value block. The
 * ->set_lvb lock type callback must be defined.
 */
#define LOCK_TYPE_USES_LVB 0x2
234
ccd979bd 235static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
54a7e755 236 .get_osb = ocfs2_get_inode_osb,
f625c979 237 .flags = 0,
ccd979bd
MF
238};
239
e63aecb6 240static struct ocfs2_lock_res_ops ocfs2_inode_inode_lops = {
54a7e755 241 .get_osb = ocfs2_get_inode_osb,
810d5aeb
MF
242 .check_downconvert = ocfs2_check_meta_downconvert,
243 .set_lvb = ocfs2_set_meta_lvb,
f1f54068 244 .downconvert_worker = ocfs2_data_convert_worker,
b80fc012 245 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
ccd979bd
MF
246};
247
ccd979bd 248static struct ocfs2_lock_res_ops ocfs2_super_lops = {
f625c979 249 .flags = LOCK_TYPE_REQUIRES_REFRESH,
ccd979bd
MF
250};
251
252static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
f625c979 253 .flags = 0,
ccd979bd
MF
254};
255
6ca497a8 256static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
257 .flags = 0,
258};
259
83273932
SE
260static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
261 .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
262};
263
d680efe9 264static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
54a7e755 265 .get_osb = ocfs2_get_dentry_osb,
d680efe9 266 .post_unlock = ocfs2_dentry_post_unlock,
cc567d89 267 .downconvert_worker = ocfs2_dentry_convert_worker,
f625c979 268 .flags = 0,
d680efe9
MF
269};
270
50008630
TY
271static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
272 .get_osb = ocfs2_get_inode_osb,
273 .flags = 0,
274};
275
cf8e06f1
MF
276static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
277 .get_osb = ocfs2_get_file_osb,
278 .flags = 0,
279};
280
9e33d69f
JK
281static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
282 .set_lvb = ocfs2_set_qinfo_lvb,
283 .get_osb = ocfs2_get_qinfo_osb,
284 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
285};
286
8dec98ed
TM
287static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
288 .check_downconvert = ocfs2_check_refcount_downconvert,
289 .downconvert_worker = ocfs2_refcount_convert_worker,
290 .flags = 0,
291};
292
ccd979bd
MF
293static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
294{
295 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
50008630
TY
296 lockres->l_type == OCFS2_LOCK_TYPE_RW ||
297 lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
ccd979bd
MF
298}
299
c0e41338 300static inline struct ocfs2_lock_res *ocfs2_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
a796d286
JB
301{
302 return container_of(lksb, struct ocfs2_lock_res, l_lksb);
303}
304
ccd979bd
MF
305static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
306{
307 BUG_ON(!ocfs2_is_inode_lock(lockres));
308
309 return (struct inode *) lockres->l_priv;
310}
311
d680efe9
MF
312static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res *lockres)
313{
314 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_DENTRY);
315
316 return (struct ocfs2_dentry_lock *)lockres->l_priv;
317}
318
9e33d69f
JK
319static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
320{
321 BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
322
323 return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
324}
325
8dec98ed
TM
326static inline struct ocfs2_refcount_tree *
327ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
328{
329 return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
330}
331
54a7e755
MF
332static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
333{
334 if (lockres->l_ops->get_osb)
335 return lockres->l_ops->get_osb(lockres);
336
337 return (struct ocfs2_super *)lockres->l_priv;
338}
339
ccd979bd
MF
340static int ocfs2_lock_create(struct ocfs2_super *osb,
341 struct ocfs2_lock_res *lockres,
342 int level,
bd3e7610 343 u32 dlm_flags);
ccd979bd
MF
344static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
345 int wanted);
cb25797d
JK
346static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
347 struct ocfs2_lock_res *lockres,
348 int level, unsigned long caller_ip);
349static inline void ocfs2_cluster_unlock(struct ocfs2_super *osb,
350 struct ocfs2_lock_res *lockres,
351 int level)
352{
353 __ocfs2_cluster_unlock(osb, lockres, level, _RET_IP_);
354}
355
ccd979bd
MF
356static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
357static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
358static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
359static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
360static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
361 struct ocfs2_lock_res *lockres);
362static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
363 int convert);
c74ff8bb
SM
364#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
365 if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \
366 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
367 _err, _func, _lockres->l_name); \
368 else \
369 mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \
370 _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \
371 (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \
ccd979bd 372} while (0)
34d024f8
MF
373static int ocfs2_downconvert_thread(void *arg);
374static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
375 struct ocfs2_lock_res *lockres);
e63aecb6 376static int ocfs2_inode_lock_update(struct inode *inode,
ccd979bd
MF
377 struct buffer_head **bh);
378static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
379static inline int ocfs2_highest_compat_lock_level(int level);
de551246
JB
380static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
381 int new_level);
cf8e06f1
MF
382static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
383 struct ocfs2_lock_res *lockres,
384 int new_level,
de551246
JB
385 int lvb,
386 unsigned int generation);
cf8e06f1
MF
387static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
388 struct ocfs2_lock_res *lockres);
389static int ocfs2_cancel_convert(struct ocfs2_super *osb,
390 struct ocfs2_lock_res *lockres);
391
ccd979bd 392
ccd979bd
MF
393static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
394 u64 blkno,
395 u32 generation,
396 char *name)
397{
398 int len;
399
400 mlog_entry_void();
401
402 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
403
b0697053
MF
404 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
405 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
406 (long long)blkno, generation);
ccd979bd
MF
407
408 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
409
410 mlog(0, "built lock resource with name: %s\n", name);
411
412 mlog_exit_void();
413}
414
34af946a 415static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
ccd979bd
MF
416
417static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
418 struct ocfs2_dlm_debug *dlm_debug)
419{
420 mlog(0, "Add tracking for lockres %s\n", res->l_name);
421
422 spin_lock(&ocfs2_dlm_tracking_lock);
423 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
424 spin_unlock(&ocfs2_dlm_tracking_lock);
425}
426
427static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
428{
429 spin_lock(&ocfs2_dlm_tracking_lock);
430 if (!list_empty(&res->l_debug_list))
431 list_del_init(&res->l_debug_list);
432 spin_unlock(&ocfs2_dlm_tracking_lock);
433}
434
#ifdef CONFIG_OCFS2_FS_STATS
/* Reset all per-lockres statistics counters. */
static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
	res->l_lock_num_prmode = 0;
	res->l_lock_num_prmode_failed = 0;
	res->l_lock_total_prmode = 0;
	res->l_lock_max_prmode = 0;
	res->l_lock_num_exmode = 0;
	res->l_lock_num_exmode_failed = 0;
	res->l_lock_total_exmode = 0;
	res->l_lock_max_exmode = 0;
	res->l_lock_refresh = 0;
}

/*
 * Account one lock acquisition attempt: bump the count, add the wall
 * time spent waiting (since mw->mw_lock_start), track the maximum,
 * and count failures.  Levels other than PR/EX are ignored.
 */
static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
				    struct ocfs2_mask_waiter *mw, int ret)
{
	unsigned long long *num, *sum;
	unsigned int *max, *failed;
	struct timespec ts = current_kernel_time();
	unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;

	if (level == LKM_PRMODE) {
		num = &res->l_lock_num_prmode;
		sum = &res->l_lock_total_prmode;
		max = &res->l_lock_max_prmode;
		failed = &res->l_lock_num_prmode_failed;
	} else if (level == LKM_EXMODE) {
		num = &res->l_lock_num_exmode;
		sum = &res->l_lock_total_exmode;
		max = &res->l_lock_max_exmode;
		failed = &res->l_lock_num_exmode_failed;
	} else
		return;

	(*num)++;
	(*sum) += time;
	if (time > *max)
		*max = time;
	if (ret)
		(*failed)++;
}

static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
{
	lockres->l_lock_refresh++;
}

/* Record the ns timestamp at which a wait for this lock began. */
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
	struct timespec ts = current_kernel_time();
	mw->mw_lock_start = timespec_to_ns(&ts);
}
#else
/* Stats disabled: all of these compile away to nothing. */
static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
{
}
static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
					   int level, struct ocfs2_mask_waiter *mw, int ret)
{
}
static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
{
}
static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
{
}
#endif
503
ccd979bd
MF
504static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
505 struct ocfs2_lock_res *res,
506 enum ocfs2_lock_type type,
ccd979bd
MF
507 struct ocfs2_lock_res_ops *ops,
508 void *priv)
509{
ccd979bd
MF
510 res->l_type = type;
511 res->l_ops = ops;
512 res->l_priv = priv;
513
bd3e7610
JB
514 res->l_level = DLM_LOCK_IV;
515 res->l_requested = DLM_LOCK_IV;
516 res->l_blocking = DLM_LOCK_IV;
ccd979bd
MF
517 res->l_action = OCFS2_AST_INVALID;
518 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
519
520 res->l_flags = OCFS2_LOCK_INITIALIZED;
521
522 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
8ddb7b00
SM
523
524 ocfs2_init_lock_stats(res);
cb25797d
JK
525#ifdef CONFIG_DEBUG_LOCK_ALLOC
526 if (type != OCFS2_LOCK_TYPE_OPEN)
527 lockdep_init_map(&res->l_lockdep_map, ocfs2_lock_type_strings[type],
528 &lockdep_keys[type], 0);
529 else
530 res->l_lockdep_map.key = NULL;
531#endif
ccd979bd
MF
532}
533
534void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
535{
536 /* This also clears out the lock status block */
537 memset(res, 0, sizeof(struct ocfs2_lock_res));
538 spin_lock_init(&res->l_lock);
539 init_waitqueue_head(&res->l_event);
540 INIT_LIST_HEAD(&res->l_blocked_list);
541 INIT_LIST_HEAD(&res->l_mask_waiters);
542}
543
544void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
545 enum ocfs2_lock_type type,
24c19ef4 546 unsigned int generation,
ccd979bd
MF
547 struct inode *inode)
548{
549 struct ocfs2_lock_res_ops *ops;
550
551 switch(type) {
552 case OCFS2_LOCK_TYPE_RW:
553 ops = &ocfs2_inode_rw_lops;
554 break;
555 case OCFS2_LOCK_TYPE_META:
e63aecb6 556 ops = &ocfs2_inode_inode_lops;
ccd979bd 557 break;
50008630
TY
558 case OCFS2_LOCK_TYPE_OPEN:
559 ops = &ocfs2_inode_open_lops;
560 break;
ccd979bd
MF
561 default:
562 mlog_bug_on_msg(1, "type: %d\n", type);
563 ops = NULL; /* thanks, gcc */
564 break;
565 };
566
d680efe9 567 ocfs2_build_lock_name(type, OCFS2_I(inode)->ip_blkno,
24c19ef4 568 generation, res->l_name);
d680efe9
MF
569 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, ops, inode);
570}
571
54a7e755
MF
572static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
573{
574 struct inode *inode = ocfs2_lock_res_inode(lockres);
575
576 return OCFS2_SB(inode->i_sb);
577}
578
9e33d69f
JK
579static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
580{
581 struct ocfs2_mem_dqinfo *info = lockres->l_priv;
582
583 return OCFS2_SB(info->dqi_gi.dqi_sb);
584}
585
cf8e06f1
MF
586static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
587{
588 struct ocfs2_file_private *fp = lockres->l_priv;
589
590 return OCFS2_SB(fp->fp_file->f_mapping->host->i_sb);
591}
592
d680efe9
MF
593static __u64 ocfs2_get_dentry_lock_ino(struct ocfs2_lock_res *lockres)
594{
595 __be64 inode_blkno_be;
596
597 memcpy(&inode_blkno_be, &lockres->l_name[OCFS2_DENTRY_LOCK_INO_START],
598 sizeof(__be64));
599
600 return be64_to_cpu(inode_blkno_be);
601}
602
54a7e755
MF
603static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres)
604{
605 struct ocfs2_dentry_lock *dl = lockres->l_priv;
606
607 return OCFS2_SB(dl->dl_inode->i_sb);
608}
609
d680efe9
MF
610void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
611 u64 parent, struct inode *inode)
612{
613 int len;
614 u64 inode_blkno = OCFS2_I(inode)->ip_blkno;
615 __be64 inode_blkno_be = cpu_to_be64(inode_blkno);
616 struct ocfs2_lock_res *lockres = &dl->dl_lockres;
617
618 ocfs2_lock_res_init_once(lockres);
619
620 /*
621 * Unfortunately, the standard lock naming scheme won't work
622 * here because we have two 16 byte values to use. Instead,
623 * we'll stuff the inode number as a binary value. We still
624 * want error prints to show something without garbling the
625 * display, so drop a null byte in there before the inode
626 * number. A future version of OCFS2 will likely use all
627 * binary lock names. The stringified names have been a
628 * tremendous aid in debugging, but now that the debugfs
629 * interface exists, we can mangle things there if need be.
630 *
631 * NOTE: We also drop the standard "pad" value (the total lock
632 * name size stays the same though - the last part is all
633 * zeros due to the memset in ocfs2_lock_res_init_once()
634 */
635 len = snprintf(lockres->l_name, OCFS2_DENTRY_LOCK_INO_START,
636 "%c%016llx",
637 ocfs2_lock_type_char(OCFS2_LOCK_TYPE_DENTRY),
638 (long long)parent);
639
640 BUG_ON(len != (OCFS2_DENTRY_LOCK_INO_START - 1));
641
642 memcpy(&lockres->l_name[OCFS2_DENTRY_LOCK_INO_START], &inode_blkno_be,
643 sizeof(__be64));
644
645 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
646 OCFS2_LOCK_TYPE_DENTRY, &ocfs2_dentry_lops,
647 dl);
ccd979bd
MF
648}
649
650static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
651 struct ocfs2_super *osb)
652{
653 /* Superblock lockres doesn't come from a slab so we call init
654 * once on it manually. */
655 ocfs2_lock_res_init_once(res);
d680efe9
MF
656 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_SUPER, OCFS2_SUPER_BLOCK_BLKNO,
657 0, res->l_name);
ccd979bd 658 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
ccd979bd
MF
659 &ocfs2_super_lops, osb);
660}
661
662static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
663 struct ocfs2_super *osb)
664{
665 /* Rename lockres doesn't come from a slab so we call init
666 * once on it manually. */
667 ocfs2_lock_res_init_once(res);
d680efe9
MF
668 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_RENAME, 0, 0, res->l_name);
669 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME,
ccd979bd
MF
670 &ocfs2_rename_lops, osb);
671}
672
6ca497a8 673static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
674 struct ocfs2_super *osb)
675{
676 /* nfs_sync lockres doesn't come from a slab so we call init
677 * once on it manually. */
678 ocfs2_lock_res_init_once(res);
679 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_NFS_SYNC, 0, 0, res->l_name);
680 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_NFS_SYNC,
681 &ocfs2_nfs_sync_lops, osb);
682}
683
83273932
SE
684static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
685 struct ocfs2_super *osb)
686{
83273932
SE
687 ocfs2_lock_res_init_once(res);
688 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_ORPHAN_SCAN, 0, 0, res->l_name);
689 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_ORPHAN_SCAN,
690 &ocfs2_orphan_scan_lops, osb);
83273932
SE
691}
692
cf8e06f1
MF
693void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
694 struct ocfs2_file_private *fp)
695{
696 struct inode *inode = fp->fp_file->f_mapping->host;
697 struct ocfs2_inode_info *oi = OCFS2_I(inode);
698
699 ocfs2_lock_res_init_once(lockres);
700 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_FLOCK, oi->ip_blkno,
701 inode->i_generation, lockres->l_name);
702 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), lockres,
703 OCFS2_LOCK_TYPE_FLOCK, &ocfs2_flock_lops,
704 fp);
705 lockres->l_flags |= OCFS2_LOCK_NOCACHE;
706}
707
9e33d69f
JK
708void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
709 struct ocfs2_mem_dqinfo *info)
710{
711 ocfs2_lock_res_init_once(lockres);
712 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
713 0, lockres->l_name);
714 ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
715 OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
716 info);
717}
718
8dec98ed
TM
719void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
720 struct ocfs2_super *osb, u64 ref_blkno,
721 unsigned int generation)
722{
723 ocfs2_lock_res_init_once(lockres);
724 ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
725 generation, lockres->l_name);
726 ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
727 &ocfs2_refcount_block_lops, osb);
728}
729
ccd979bd
MF
730void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
731{
732 mlog_entry_void();
733
734 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
735 return;
736
737 ocfs2_remove_lockres_tracking(res);
738
739 mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
740 "Lockres %s is on the blocked list\n",
741 res->l_name);
742 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
743 "Lockres %s has mask waiters pending\n",
744 res->l_name);
745 mlog_bug_on_msg(spin_is_locked(&res->l_lock),
746 "Lockres %s is locked\n",
747 res->l_name);
748 mlog_bug_on_msg(res->l_ro_holders,
749 "Lockres %s has %u ro holders\n",
750 res->l_name, res->l_ro_holders);
751 mlog_bug_on_msg(res->l_ex_holders,
752 "Lockres %s has %u ex holders\n",
753 res->l_name, res->l_ex_holders);
754
755 /* Need to clear out the lock status block for the dlm */
756 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
757
758 res->l_flags = 0UL;
759 mlog_exit_void();
760}
761
762static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
763 int level)
764{
765 mlog_entry_void();
766
767 BUG_ON(!lockres);
768
769 switch(level) {
bd3e7610 770 case DLM_LOCK_EX:
ccd979bd
MF
771 lockres->l_ex_holders++;
772 break;
bd3e7610 773 case DLM_LOCK_PR:
ccd979bd
MF
774 lockres->l_ro_holders++;
775 break;
776 default:
777 BUG();
778 }
779
780 mlog_exit_void();
781}
782
783static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
784 int level)
785{
786 mlog_entry_void();
787
788 BUG_ON(!lockres);
789
790 switch(level) {
bd3e7610 791 case DLM_LOCK_EX:
ccd979bd
MF
792 BUG_ON(!lockres->l_ex_holders);
793 lockres->l_ex_holders--;
794 break;
bd3e7610 795 case DLM_LOCK_PR:
ccd979bd
MF
796 BUG_ON(!lockres->l_ro_holders);
797 lockres->l_ro_holders--;
798 break;
799 default:
800 BUG();
801 }
802 mlog_exit_void();
803}
804
805/* WARNING: This function lives in a world where the only three lock
806 * levels are EX, PR, and NL. It *will* have to be adjusted when more
807 * lock types are added. */
808static inline int ocfs2_highest_compat_lock_level(int level)
809{
bd3e7610 810 int new_level = DLM_LOCK_EX;
ccd979bd 811
bd3e7610
JB
812 if (level == DLM_LOCK_EX)
813 new_level = DLM_LOCK_NL;
814 else if (level == DLM_LOCK_PR)
815 new_level = DLM_LOCK_PR;
ccd979bd
MF
816 return new_level;
817}
818
819static void lockres_set_flags(struct ocfs2_lock_res *lockres,
820 unsigned long newflags)
821{
800deef3 822 struct ocfs2_mask_waiter *mw, *tmp;
ccd979bd
MF
823
824 assert_spin_locked(&lockres->l_lock);
825
826 lockres->l_flags = newflags;
827
800deef3 828 list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
ccd979bd
MF
829 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
830 continue;
831
832 list_del_init(&mw->mw_item);
833 mw->mw_status = 0;
834 complete(&mw->mw_complete);
835 }
836}
837static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
838{
839 lockres_set_flags(lockres, lockres->l_flags | or);
840}
841static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
842 unsigned long clear)
843{
844 lockres_set_flags(lockres, lockres->l_flags & ~clear);
845}
846
847static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
848{
849 mlog_entry_void();
850
851 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
852 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
853 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
bd3e7610 854 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
ccd979bd
MF
855
856 lockres->l_level = lockres->l_requested;
857 if (lockres->l_level <=
858 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
bd3e7610 859 lockres->l_blocking = DLM_LOCK_NL;
ccd979bd
MF
860 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
861 }
862 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
863
864 mlog_exit_void();
865}
866
867static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
868{
869 mlog_entry_void();
870
871 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
872 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
873
874 /* Convert from RO to EX doesn't really need anything as our
875 * information is already up to data. Convert from NL to
876 * *anything* however should mark ourselves as needing an
877 * update */
bd3e7610 878 if (lockres->l_level == DLM_LOCK_NL &&
f625c979 879 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
ccd979bd
MF
880 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
881
882 lockres->l_level = lockres->l_requested;
a1912826
SM
883
884 /*
885 * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing
886 * the OCFS2_LOCK_BUSY flag to prevent the dc thread from
887 * downconverting the lock before the upconvert has fully completed.
888 */
889 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
890
ccd979bd
MF
891 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
892
893 mlog_exit_void();
894}
895
896static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
897{
898 mlog_entry_void();
899
3cf0c507 900 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
ccd979bd
MF
901 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
902
bd3e7610 903 if (lockres->l_requested > DLM_LOCK_NL &&
f625c979
MF
904 !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
905 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
ccd979bd
MF
906 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
907
908 lockres->l_level = lockres->l_requested;
909 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
910 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
911
912 mlog_exit_void();
913}
914
ccd979bd
MF
/* Record a blocking AST on this lockres: remember the most restrictive
 * level another node is waiting at in l_blocking, and set
 * OCFS2_LOCK_BLOCKED when a (new) downconvert must be scheduled.
 * Returns 1 in that case, 0 otherwise.  Caller holds l_lock
 * (asserted below). */
 915static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 916				     int level)
 917{
 918	int needs_downconvert = 0;
 919	mlog_entry_void();
 920
 921	assert_spin_locked(&lockres->l_lock);
 922
ccd979bd
MF
 923	if (level > lockres->l_blocking) {
 924		/* only schedule a downconvert if we haven't already scheduled
 925		 * one that goes low enough to satisfy the level we're
 926		 * blocking.  this also catches the case where we get
 927		 * duplicate BASTs */
 928		if (ocfs2_highest_compat_lock_level(level) <
 929		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
 930			needs_downconvert = 1;
 931
 932		lockres->l_blocking = level;
 933	}
 934
0b94a909
WW
 935	if (needs_downconvert)
 936		lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
 937
ccd979bd
MF
 938	mlog_exit(needs_downconvert);
 939	return needs_downconvert;
 940}
941
de551246
JB
942/*
943 * OCFS2_LOCK_PENDING and l_pending_gen.
944 *
945 * Why does OCFS2_LOCK_PENDING exist? To close a race between setting
946 * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
947 * for more details on the race.
948 *
949 * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
950 * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock()
951 * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
952 * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
953 * the caller is going to try to clear PENDING again. If nothing else is
954 * happening, __lockres_clear_pending() sees PENDING is unset and does
955 * nothing.
956 *
957 * But what if another path (eg downconvert thread) has just started a
958 * new locking action? The other path has re-set PENDING. Our path
959 * cannot clear PENDING, because that will re-open the original race
960 * window.
961 *
962 * [Example]
963 *
964 * ocfs2_meta_lock()
965 * ocfs2_cluster_lock()
966 * set BUSY
967 * set PENDING
968 * drop l_lock
969 * ocfs2_dlm_lock()
970 * ocfs2_locking_ast() ocfs2_downconvert_thread()
971 * clear PENDING ocfs2_unblock_lock()
972 * take_l_lock
973 * !BUSY
974 * ocfs2_prepare_downconvert()
975 * set BUSY
976 * set PENDING
977 * drop l_lock
978 * take l_lock
979 * clear PENDING
980 * drop l_lock
981 * <window>
982 * ocfs2_dlm_lock()
983 *
984 * So as you can see, we now have a window where l_lock is not held,
985 * PENDING is not set, and ocfs2_dlm_lock() has not been called.
986 *
987 * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
988 * set by ocfs2_prepare_downconvert(). That wasn't nice.
989 *
990 * To solve this we introduce l_pending_gen. A call to
991 * lockres_clear_pending() will only do so when it is passed a generation
992 * number that matches the lockres. lockres_set_pending() will return the
993 * current generation number. When ocfs2_cluster_lock() goes to clear
994 * PENDING, it passes the generation it got from set_pending(). In our
995 * example above, the generation numbers will *not* match. Thus,
996 * ocfs2_cluster_lock() will not clear the PENDING set by
997 * ocfs2_prepare_downconvert().
998 */
999
1000/* Unlocked version for ocfs2_locking_ast() */
/* Clear OCFS2_LOCK_PENDING only if 'generation' still matches
 * l_pending_gen (see the long comment above on why a stale generation
 * must not clear PENDING), bump the generation, and wake the
 * downconvert thread if it may have skipped this lockres. */
1001static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
1002				    unsigned int generation,
1003				    struct ocfs2_super *osb)
1004{
1005	assert_spin_locked(&lockres->l_lock);
1006
1007	/*
1008	 * The ast and locking functions can race us here.  The winner
1009	 * will clear pending, the loser will not.
1010	 */
1011	if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
1012	    (lockres->l_pending_gen != generation))
1013		return;
1014
1015	lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
1016	lockres->l_pending_gen++;
1017
1018	/*
1019	 * The downconvert thread may have skipped us because we
1020	 * were PENDING.  Wake it up.
1021	 */
1022	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1023		ocfs2_wake_downconvert_thread(osb);
1024}
1025
1026/* Locked version for callers of ocfs2_dlm_lock() */
1027static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
1028 unsigned int generation,
1029 struct ocfs2_super *osb)
1030{
1031 unsigned long flags;
1032
1033 spin_lock_irqsave(&lockres->l_lock, flags);
1034 __lockres_clear_pending(lockres, generation, osb);
1035 spin_unlock_irqrestore(&lockres->l_lock, flags);
1036}
1037
/* Mark a BUSY lockres PENDING (a dlm_lock() call is about to be issued)
 * and return the current generation number; the caller must hand that
 * generation back to lockres_clear_pending() after the dlm call.
 * Caller holds l_lock. */
1038static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
1039{
1040	assert_spin_locked(&lockres->l_lock);
1041	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
1042
1043	lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
1044
1045	return lockres->l_pending_gen;
1046}
1047
/* DLM blocking-AST callback: another node wants this resource at
 * 'level'.  Record the conflict via ocfs2_generic_handle_bast() and,
 * if a downconvert is needed, queue the lockres and wake the
 * downconvert thread. */
c0e41338 1048static void ocfs2_blocking_ast(struct ocfs2_dlm_lksb *lksb, int level)
ccd979bd 1049{
a796d286 1050	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
aa2623ad 1051	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
ccd979bd
MF
1052	int needs_downconvert;
1053	unsigned long flags;
1054
bd3e7610 1055	BUG_ON(level <= DLM_LOCK_NL);
ccd979bd 1056
aa2623ad
MF
1057	mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
1058	     lockres->l_name, level, lockres->l_level,
1059	     ocfs2_lock_type_string(lockres->l_type));
1060
cf8e06f1
MF
1061	/*
1062	 * We can skip the bast for locks which don't enable caching -
1063	 * they'll be dropped at the earliest possible time anyway.
1064	 */
1065	if (lockres->l_flags & OCFS2_LOCK_NOCACHE)
1066		return;
1067
ccd979bd
MF
1068	spin_lock_irqsave(&lockres->l_lock, flags);
1069	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
1070	if (needs_downconvert)
1071		ocfs2_schedule_blocked_lock(osb, lockres);
1072	spin_unlock_irqrestore(&lockres->l_lock, flags);
1073
d680efe9
MF
1074	wake_up(&lockres->l_event);
1075
34d024f8 1076	ocfs2_wake_downconvert_thread(osb);
ccd979bd
MF
1077}
1078
/* DLM grant AST: completes whichever action (attach / convert /
 * downconvert) is in flight on this lockres, then clears BUSY and the
 * PENDING state and wakes waiters.  An -EAGAIN lksb status means a
 * NOQUEUE request was refused; any other non-zero status is logged and
 * the lockres is left untouched. */
c0e41338 1079static void ocfs2_locking_ast(struct ocfs2_dlm_lksb *lksb)
ccd979bd 1080{
a796d286 1081	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
de551246 1082	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
ccd979bd 1083	unsigned long flags;
1693a5c0 1084	int status;
ccd979bd
MF
1085
1086	spin_lock_irqsave(&lockres->l_lock, flags);
1087
1693a5c0
DT
1088	status = ocfs2_dlm_lock_status(&lockres->l_lksb);
1089
1090	if (status == -EAGAIN) {
1091		lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1092		goto out;
1093	}
1094
1095	if (status) {
8f2c9c1b 1096		mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
1693a5c0 1097		     lockres->l_name, status);
ccd979bd
MF
1098		spin_unlock_irqrestore(&lockres->l_lock, flags);
1099		return;
1100	}
1101
1102	switch(lockres->l_action) {
1103	case OCFS2_AST_ATTACH:
1104		ocfs2_generic_handle_attach_action(lockres);
e92d57df 1105		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
ccd979bd
MF
1106		break;
1107	case OCFS2_AST_CONVERT:
1108		ocfs2_generic_handle_convert_action(lockres);
1109		break;
1110	case OCFS2_AST_DOWNCONVERT:
1111		ocfs2_generic_handle_downconvert_action(lockres);
1112		break;
1113	default:
e92d57df
MF
1114		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
1115		     "lockres flags = 0x%lx, unlock action: %u\n",
1116		     lockres->l_name, lockres->l_action, lockres->l_flags,
1117		     lockres->l_unlock_action);
ccd979bd
MF
1118		BUG();
1119	}
1693a5c0 1120out:
ccd979bd
MF
1121	/* set it to something invalid so if we get called again we
1122	 * can catch it. */
1123	lockres->l_action = OCFS2_AST_INVALID;
ccd979bd 1124
de551246
JB
1125	/* Did we try to cancel this lock?  Clear that state */
1126	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
1127		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1128
1129	/*
1130	 * We may have beaten the locking functions here.  We certainly
1131	 * know that dlm_lock() has been called :-)
1132	 * Because we can't have two lock calls in flight at once, we
1133	 * can use lockres->l_pending_gen.
1134	 */
1135	__lockres_clear_pending(lockres, lockres->l_pending_gen, osb);
1136
ccd979bd 1137	wake_up(&lockres->l_event);
d680efe9 1138	spin_unlock_irqrestore(&lockres->l_lock, flags);
ccd979bd
MF
1139}
1140
553b5eb9
JB
/* DLM unlock AST: completes either a convert-cancel
 * (OCFS2_UNLOCK_CANCEL_CONVERT) or a full lock drop
 * (OCFS2_UNLOCK_DROP_LOCK, level reset to DLM_LOCK_IV), then clears
 * BUSY and wakes waiters.  A dlm-reported error leaves the unlock
 * state intact for the caller to observe. */
1141static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1142{
1143	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1144	unsigned long flags;
1145
1146	mlog_entry_void();
1147
1148	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
1149	     lockres->l_unlock_action);
1150
1151	spin_lock_irqsave(&lockres->l_lock, flags);
1152	if (error) {
1153		mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
1154		     "unlock_action %d\n", error, lockres->l_name,
1155		     lockres->l_unlock_action);
1156		spin_unlock_irqrestore(&lockres->l_lock, flags);
1157		mlog_exit_void();
1158		return;
1159	}
1160
1161	switch(lockres->l_unlock_action) {
1162	case OCFS2_UNLOCK_CANCEL_CONVERT:
1163		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
1164		lockres->l_action = OCFS2_AST_INVALID;
1165		/* Downconvert thread may have requeued this lock, we
1166		 * need to wake it. */
1167		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
1168			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
1169		break;
1170	case OCFS2_UNLOCK_DROP_LOCK:
1171		lockres->l_level = DLM_LOCK_IV;
1172		break;
1173	default:
1174		BUG();
1175	}
1176
1177	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1178	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1179	wake_up(&lockres->l_event);
1180	spin_unlock_irqrestore(&lockres->l_lock, flags);
1181
1182	mlog_exit_void();
1183}
1184
1185/*
1186 * This is the filesystem locking protocol. It provides the lock handling
1187 * hooks for the underlying DLM. It has a maximum version number.
1188 * The version number allows interoperability with systems running at
1189 * the same major number and an equal or smaller minor number.
1190 *
1191 * Whenever the filesystem does new things with locks (adds or removes a
1192 * lock, orders them differently, does different things underneath a lock),
1193 * the version must be changed. The protocol is negotiated when joining
1194 * the dlm domain. A node may join the domain if its major version is
1195 * identical to all other nodes and its minor version is greater than
1196 * or equal to all other nodes. When its minor version is greater than
1197 * the other nodes, it will run at the minor version specified by the
1198 * other nodes.
1199 *
1200 * If a locking change is made that will not be compatible with older
1201 * versions, the major number must be increased and the minor version set
1202 * to zero. If a change merely adds a behavior that can be disabled when
1203 * speaking to older versions, the minor version must be increased. If a
1204 * change adds a fully backwards compatible change (eg, LVB changes that
1205 * are just ignored by older versions), the version does not need to be
1206 * updated.
1207 */
/* The filesystem's locking protocol handed to the stack glue: the
 * three AST callbacks above plus the maximum protocol version this
 * code speaks (see the version-negotiation comment above). */
1208static struct ocfs2_locking_protocol lproto = {
1209	.lp_max_version = {
1210		.pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
1211		.pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
1212	},
1213	.lp_lock_ast		= ocfs2_locking_ast,
1214	.lp_blocking_ast	= ocfs2_blocking_ast,
1215	.lp_unlock_ast		= ocfs2_unlock_ast,
1216};
1217
/* Register our maximum protocol version with the cluster stack glue. */
1218void ocfs2_set_locking_protocol(void)
1219{
1220	ocfs2_stack_glue_set_max_proto_version(&lproto.lp_max_version);
1221}
1222
ccd979bd
MF
/* Roll back local lockres state after ocfs2_dlm_lock()/unlock() failed
 * synchronously (so no AST will fire): clear BUSY and
 * UPCONVERT_FINISHING, invalidate the pending action ('convert' != 0)
 * or unlock action, and wake anyone waiting on the lockres. */
1223static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1224						int convert)
1225{
1226	unsigned long flags;
1227
1228	mlog_entry_void();
1229	spin_lock_irqsave(&lockres->l_lock, flags);
1230	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
a1912826 1231	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
ccd979bd
MF
1232	if (convert)
1233		lockres->l_action = OCFS2_AST_INVALID;
1234	else
1235		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1236	spin_unlock_irqrestore(&lockres->l_lock, flags);
1237
1238	wake_up(&lockres->l_event);
1239	mlog_exit_void();
1240}
1241
1242/* Note: If we detect another process working on the lock (i.e.,
1243 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
1244 * to do the right thing in that case.
1245 */
/* Issue the initial dlm_lock() attach for a lockres at 'level'.
 * Marks the lockres BUSY/PENDING around the dlm call; the grant is
 * completed asynchronously by ocfs2_locking_ast().  On synchronous
 * dlm failure the local state is rolled back via
 * ocfs2_recover_from_dlm_error(). */
1246static int ocfs2_lock_create(struct ocfs2_super *osb,
1247			     struct ocfs2_lock_res *lockres,
1248			     int level,
bd3e7610 1249			     u32 dlm_flags)
ccd979bd
MF
1250{
1251	int ret = 0;
ccd979bd 1252	unsigned long flags;
de551246 1253	unsigned int gen;
ccd979bd
MF
1254
1255	mlog_entry_void();
1256
bd3e7610 1257	mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
ccd979bd
MF
1258	     dlm_flags);
1259
1260	spin_lock_irqsave(&lockres->l_lock, flags);
1261	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
1262	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
1263		spin_unlock_irqrestore(&lockres->l_lock, flags);
1264		goto bail;
1265	}
1266
1267	lockres->l_action = OCFS2_AST_ATTACH;
1268	lockres->l_requested = level;
1269	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
de551246 1270	gen = lockres_set_pending(lockres);
ccd979bd
MF
1271	spin_unlock_irqrestore(&lockres->l_lock, flags);
1272
4670c46d 1273	ret = ocfs2_dlm_lock(osb->cconn,
7431cd7e
JB
1274			     level,
1275			     &lockres->l_lksb,
1276			     dlm_flags,
1277			     lockres->l_name,
a796d286 1278			     OCFS2_LOCK_ID_MAX_LEN - 1);
de551246 1279	lockres_clear_pending(lockres, gen, osb);
7431cd7e
JB
1280	if (ret) {
1281		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
ccd979bd
MF
1282		ocfs2_recover_from_dlm_error(lockres, 1);
1283	}
1284
7431cd7e 1285	mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
ccd979bd
MF
1286
1287bail:
1288	mlog_exit(ret);
1289	return ret;
1290}
1291
1292static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
1293 int flag)
1294{
1295 unsigned long flags;
1296 int ret;
1297
1298 spin_lock_irqsave(&lockres->l_lock, flags);
1299 ret = lockres->l_flags & flag;
1300 spin_unlock_irqrestore(&lockres->l_lock, flags);
1301
1302 return ret;
1303}
1304
/* Sleep until OCFS2_LOCK_BUSY clears (no in-flight dlm call). */
1305static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
1306
1307{
1308	wait_event(lockres->l_event,
1309		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
1310}
1311
/* Sleep until OCFS2_LOCK_REFRESHING clears. */
1312static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
1313
1314{
1315	wait_event(lockres->l_event,
1316		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
1317}
1318
1319/* predict what lock level we'll be dropping down to on behalf
1320 * of another node, and return true if the currently wanted
1321 * level will be compatible with it. */
1322static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
1323						     int wanted)
1324{
1325	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
1326
1327	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
1328}
1329
/* Prepare a stack-allocated mask waiter for use with
 * lockres_add_mask_waiter() / ocfs2_wait_for_mask(). */
1330static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
1331{
1332	INIT_LIST_HEAD(&mw->mw_item);
1333	init_completion(&mw->mw_complete);
8ddb7b00 1334	ocfs2_init_start_time(mw);
ccd979bd
MF
1335}
1336
/* Block (uninterruptibly) until the waiter is completed; returns the
 * status stored by whoever satisfied it. */
1337static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
1338{
1339	wait_for_completion(&mw->mw_complete);
1340	/* Re-arm the completion in case we want to wait on it again */
1341	INIT_COMPLETION(mw->mw_complete);
1342	return mw->mw_status;
1343}
1344
/* Queue 'mw' on the lockres; it is satisfied once
 * (l_flags & mask) == goal.  Caller holds l_lock (asserted). */
1345static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
1346				    struct ocfs2_mask_waiter *mw,
1347				    unsigned long mask,
1348				    unsigned long goal)
1349{
1350	BUG_ON(!list_empty(&mw->mw_item));
1351
1352	assert_spin_locked(&lockres->l_lock);
1353
1354	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
1355	mw->mw_mask = mask;
1356	mw->mw_goal = goal;
1357}
1358
1359/* returns 0 if the mw that was removed was already satisfied, -EBUSY
1360 * if the mask still hadn't reached its goal */
1361static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
1362				      struct ocfs2_mask_waiter *mw)
1363{
1364	unsigned long flags;
1365	int ret = 0;
1366
1367	spin_lock_irqsave(&lockres->l_lock, flags);
1368	if (!list_empty(&mw->mw_item)) {
1369		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
1370			ret = -EBUSY;
1371
1372		list_del_init(&mw->mw_item);
1373		init_completion(&mw->mw_complete);
1374	}
1375	spin_unlock_irqrestore(&lockres->l_lock, flags);
1376
1377	return ret;
1378
1379}
1380
cf8e06f1
MF
/* Interruptible variant of ocfs2_wait_for_mask(): on signal, dequeue
 * the waiter and return the -ERESTARTSYS from
 * wait_for_completion_interruptible(). */
1381static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
1382					     struct ocfs2_lock_res *lockres)
1383{
1384	int ret;
1385
1386	ret = wait_for_completion_interruptible(&mw->mw_complete);
1387	if (ret)
1388		lockres_remove_mask_waiter(lockres, mw);
1389	else
1390		ret = mw->mw_status;
1391	/* Re-arm the completion in case we want to wait on it again */
1392	INIT_COMPLETION(mw->mw_complete);
1393	return ret;
1394}
1395
cb25797d
JK
/* Core cluster-lock acquisition: take (or upconvert to) 'level' on a
 * lockres, retrying the BUSY/BLOCKED/convert loop at 'again' until the
 * granted level satisfies the request, then bump the holder count.
 * OCFS2_LOCK_NONBLOCK in arg_flags turns would-block into -EAGAIN;
 * DLM_LKF_NOQUEUE in lkm_flags maps a refused convert to -EAGAIN. */
1396static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1397				struct ocfs2_lock_res *lockres,
1398				int level,
1399				u32 lkm_flags,
1400				int arg_flags,
1401				int l_subclass,
1402				unsigned long caller_ip)
ccd979bd
MF
1403{
1404	struct ocfs2_mask_waiter mw;
ccd979bd
MF
1405	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
1406	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
1407	unsigned long flags;
de551246 1408	unsigned int gen;
1693a5c0 1409	int noqueue_attempted = 0;
ccd979bd
MF
1410
1411	mlog_entry_void();
1412
1413	ocfs2_init_mask_waiter(&mw);
1414
b80fc012 1415	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
bd3e7610 1416		lkm_flags |= DLM_LKF_VALBLK;
b80fc012 1417
ccd979bd
MF
1418again:
1419	wait = 0;
1420
a1912826
SM
1421	spin_lock_irqsave(&lockres->l_lock, flags);
1422
ccd979bd
MF
1423	if (catch_signals && signal_pending(current)) {
1424		ret = -ERESTARTSYS;
a1912826 1425		goto unlock;
ccd979bd
MF
1426	}
1427
ccd979bd
MF
1428	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
1429			"Cluster lock called on freeing lockres %s! flags "
1430			"0x%lx\n", lockres->l_name, lockres->l_flags);
1431
1432	/* We only compare against the currently granted level
1433	 * here. If the lock is blocked waiting on a downconvert,
1434	 * we'll get caught below. */
1435	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
1436	    level > lockres->l_level) {
1437		/* is someone sitting in dlm_lock? If so, wait on
1438		 * them. */
1439		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1440		wait = 1;
1441		goto unlock;
1442	}
1443
a1912826
SM
1444	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING) {
1445		/*
1446		 * We've upconverted. If the lock now has a level we can
1447		 * work with, we take it. If, however, the lock is not at the
1448		 * required level, we go thru the full cycle. One way this could
1449		 * happen is if a process requesting an upconvert to PR is
1450		 * closely followed by another requesting upconvert to an EX.
1451		 * If the process requesting EX lands here, we want it to
1452		 * continue attempting to upconvert and let the process
1453		 * requesting PR take the lock.
1454		 * If multiple processes request upconvert to PR, the first one
1455		 * here will take the lock. The others will have to go thru the
1456		 * OCFS2_LOCK_BLOCKED check to ensure that there is no pending
1457		 * downconvert request.
1458		 */
1459		if (level <= lockres->l_level)
1460			goto update_holders;
1461	}
1462
ccd979bd
MF
1463	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
1464	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
1465		/* is the lock is currently blocked on behalf of
1466		 * another node */
1467		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
1468		wait = 1;
1469		goto unlock;
1470	}
1471
1472	if (level > lockres->l_level) {
1693a5c0
DT
1473		if (noqueue_attempted > 0) {
1474			ret = -EAGAIN;
1475			goto unlock;
1476		}
1477		if (lkm_flags & DLM_LKF_NOQUEUE)
1478			noqueue_attempted = 1;
1479
ccd979bd
MF
1480		if (lockres->l_action != OCFS2_AST_INVALID)
1481			mlog(ML_ERROR, "lockres %s has action %u pending\n",
1482			     lockres->l_name, lockres->l_action);
1483
019d1b22
MF
1484		if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1485			lockres->l_action = OCFS2_AST_ATTACH;
bd3e7610 1486			lkm_flags &= ~DLM_LKF_CONVERT;
019d1b22
MF
1487		} else {
1488			lockres->l_action = OCFS2_AST_CONVERT;
bd3e7610 1489			lkm_flags |= DLM_LKF_CONVERT;
019d1b22
MF
1490		}
1491
ccd979bd
MF
1492		lockres->l_requested = level;
1493		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
de551246 1494		gen = lockres_set_pending(lockres);
ccd979bd
MF
1495		spin_unlock_irqrestore(&lockres->l_lock, flags);
1496
bd3e7610
JB
1497		BUG_ON(level == DLM_LOCK_IV);
1498		BUG_ON(level == DLM_LOCK_NL);
ccd979bd
MF
1499
1500		mlog(0, "lock %s, convert from %d to level = %d\n",
1501		     lockres->l_name, lockres->l_level, level);
1502
1503		/* call dlm_lock to upgrade lock now */
4670c46d 1504		ret = ocfs2_dlm_lock(osb->cconn,
7431cd7e
JB
1505				     level,
1506				     &lockres->l_lksb,
1507				     lkm_flags,
1508				     lockres->l_name,
a796d286 1509				     OCFS2_LOCK_ID_MAX_LEN - 1);
de551246 1510		lockres_clear_pending(lockres, gen, osb);
7431cd7e
JB
1511		if (ret) {
1512			if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
1513			    (ret != -EAGAIN)) {
24ef1815 1514				ocfs2_log_dlm_error("ocfs2_dlm_lock",
7431cd7e 1515						    ret, lockres);
ccd979bd
MF
1516			}
1517			ocfs2_recover_from_dlm_error(lockres, 1);
1518			goto out;
1519		}
1520
73ac36ea 1521		mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
ccd979bd
MF
1522		     lockres->l_name);
1523
1524		/* At this point we've gone inside the dlm and need to
1525		 * complete our work regardless. */
1526		catch_signals = 0;
1527
1528		/* wait for busy to clear and carry on */
1529		goto again;
1530	}
1531
a1912826 1532update_holders:
ccd979bd
MF
1533	/* Ok, if we get here then we're good to go. */
1534	ocfs2_inc_holders(lockres, level);
1535
1536	ret = 0;
1537unlock:
a1912826
SM
1538	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
1539
ccd979bd
MF
1540	spin_unlock_irqrestore(&lockres->l_lock, flags);
1541out:
1542	/*
1543	 * This is helping work around a lock inversion between the page lock
1544	 * and dlm locks.  One path holds the page lock while calling aops
1545	 * which block acquiring dlm locks.  The voting thread holds dlm
1546	 * locks while acquiring page locks while down converting data locks.
1547	 * This block is helping an aop path notice the inversion and back
1548	 * off to unlock its page lock before trying the dlm lock again.
1549	 */
1550	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
1551	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
1552		wait = 0;
1553		if (lockres_remove_mask_waiter(lockres, &mw))
1554			ret = -EAGAIN;
1555		else
1556			goto again;
1557	}
1558	if (wait) {
1559		ret = ocfs2_wait_for_mask(&mw);
1560		if (ret == 0)
1561			goto again;
1562		mlog_errno(ret);
1563	}
8ddb7b00 1564	ocfs2_update_lock_stats(lockres, level, &mw, ret);
ccd979bd 1565
cb25797d
JK
1566#ifdef CONFIG_DEBUG_LOCK_ALLOC
1567	if (!ret && lockres->l_lockdep_map.key != NULL) {
1568		if (level == DLM_LOCK_PR)
1569			rwsem_acquire_read(&lockres->l_lockdep_map, l_subclass,
1570				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
1571				caller_ip);
1572		else
1573			rwsem_acquire(&lockres->l_lockdep_map, l_subclass,
1574				!!(arg_flags & OCFS2_META_LOCK_NOQUEUE),
1575				caller_ip);
1576	}
1577#endif
ccd979bd
MF
1578	mlog_exit(ret);
1579	return ret;
1580}
1581
cb25797d
JK
/* Convenience wrapper: __ocfs2_cluster_lock() with no lockdep subclass
 * and the caller's return address for lockdep tracking. */
1582static inline int ocfs2_cluster_lock(struct ocfs2_super *osb,
1583				     struct ocfs2_lock_res *lockres,
1584				     int level,
1585				     u32 lkm_flags,
1586				     int arg_flags)
1587{
1588	return __ocfs2_cluster_lock(osb, lockres, level, lkm_flags, arg_flags,
1589				    0, _RET_IP_);
1590}
1591
1592
/* Release one holder reference at 'level'.  The DLM lock itself is not
 * dropped here; ocfs2_downconvert_on_unlock() decides whether a
 * downconvert should now be kicked off. */
1593static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
1594				   struct ocfs2_lock_res *lockres,
1595				   int level,
1596				   unsigned long caller_ip)
ccd979bd
MF
1597{
1598	unsigned long flags;
1599
1600	mlog_entry_void();
1601	spin_lock_irqsave(&lockres->l_lock, flags);
1602	ocfs2_dec_holders(lockres, level);
34d024f8 1603	ocfs2_downconvert_on_unlock(osb, lockres);
ccd979bd 1604	spin_unlock_irqrestore(&lockres->l_lock, flags);
cb25797d
JK
1605#ifdef CONFIG_DEBUG_LOCK_ALLOC
1606	if (lockres->l_lockdep_map.key != NULL)
1607		rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
1608#endif
ccd979bd
MF
1609	mlog_exit_void();
1610}
1611
da66116e
AB
/* Create a lock on a brand-new (not yet cluster-visible) resource at
 * EX or PR.  'local' adds DLM_LKF_LOCAL and marks the lockres
 * OCFS2_LOCK_LOCAL; the flag is cleared again when the attach AST
 * fires (see ocfs2_locking_ast()). */
1612static int ocfs2_create_new_lock(struct ocfs2_super *osb,
1613				 struct ocfs2_lock_res *lockres,
1614				 int ex,
1615				 int local)
ccd979bd 1616{
bd3e7610 1617	int level =  ex ? DLM_LOCK_EX : DLM_LOCK_PR;
ccd979bd 1618	unsigned long flags;
bd3e7610 1619	u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
ccd979bd
MF
1620
1621	spin_lock_irqsave(&lockres->l_lock, flags);
1622	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1623	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1624	spin_unlock_irqrestore(&lockres->l_lock, flags);
1625
24c19ef4 1626	return ocfs2_lock_create(osb, lockres, level, lkm_flags);
ccd979bd
MF
1627}
1628
1629/* Grants us an EX lock on the data and metadata resources, skipping
1630 * the normal cluster directory lookup. Use this ONLY on newly created
1631 * inodes which other nodes can't possibly see, and which haven't been
1632 * hashed in the inode hash yet. This can give us a good performance
1633 * increase as it'll skip the network broadcast normally associated
1634 * with creating a new lock resource. */
/* Creates the rw (EX, local), inode (EX, non-local) and open (PR)
 * locks for a freshly created inode; see the note above on why no
 * holder counts are taken. */
1635int ocfs2_create_new_inode_locks(struct inode *inode)
1636{
1637	int ret;
d680efe9 1638	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
ccd979bd
MF
1639
1640	BUG_ON(!inode);
1641	BUG_ON(!ocfs2_inode_is_new(inode));
1642
1643	mlog_entry_void();
1644
b0697053 1645	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
ccd979bd
MF
1646
1647	/* NOTE: That we don't increment any of the holder counts, nor
1648	 * do we add anything to a journal handle. Since this is
1649	 * supposed to be a new inode which the cluster doesn't know
1650	 * about yet, there is no need to.  As far as the LVB handling
1651	 * is concerned, this is basically like acquiring an EX lock
1652	 * on a resource which has an invalid one -- we'll set it
1653	 * valid when we release the EX. */
1654
24c19ef4 1655	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_rw_lockres, 1, 1);
ccd979bd
MF
1656	if (ret) {
1657		mlog_errno(ret);
1658		goto bail;
1659	}
1660
24c19ef4 1661	/*
bd3e7610 1662	 * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
24c19ef4
MF
1663	 * don't use a generation in their lock names.
1664	 */
e63aecb6 1665	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
ccd979bd
MF
1666	if (ret) {
1667		mlog_errno(ret);
1668		goto bail;
1669	}
1670
50008630
TY
1671	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
1672	if (ret) {
1673		mlog_errno(ret);
1674		goto bail;
1675	}
1676
ccd979bd
MF
1677bail:
1678	mlog_exit(ret);
1679	return ret;
1680}
1681
/* Take the per-inode rw cluster lock: EX for write, PR for read.
 * A no-op (returns 0) on local-only mounts. */
1682int ocfs2_rw_lock(struct inode *inode, int write)
1683{
1684	int status, level;
1685	struct ocfs2_lock_res *lockres;
c271c5c2 1686	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
ccd979bd
MF
1687
1688	BUG_ON(!inode);
1689
1690	mlog_entry_void();
1691
b0697053
MF
1692	mlog(0, "inode %llu take %s RW lock\n",
1693	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
ccd979bd
MF
1694	     write ? "EXMODE" : "PRMODE");
1695
d92bc512
CL
1696	if (ocfs2_mount_local(osb)) {
1697		mlog_exit(0);
c271c5c2 1698		return 0;
d92bc512 1699	}
c271c5c2 1700
ccd979bd
MF
1701	lockres = &OCFS2_I(inode)->ip_rw_lockres;
1702
bd3e7610 1703	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
ccd979bd
MF
1704
1705	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1706				    0);
1707	if (status < 0)
1708		mlog_errno(status);
1709
1710	mlog_exit(status);
1711	return status;
1712}
1713
/* Drop one holder of the rw cluster lock taken by ocfs2_rw_lock();
 * 'write' must match the original request so the right level is
 * released.  No-op on local-only mounts. */
1714void ocfs2_rw_unlock(struct inode *inode, int write)
1715{
bd3e7610 1716	int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
ccd979bd 1717	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
c271c5c2 1718	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
ccd979bd
MF
1719
1720	mlog_entry_void();
1721
b0697053
MF
1722	mlog(0, "inode %llu drop %s RW lock\n",
1723	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
ccd979bd
MF
1724	     write ? "EXMODE" : "PRMODE");
1725
c271c5c2
SM
1726	if (!ocfs2_mount_local(osb))
1727		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
ccd979bd
MF
1728
1729	mlog_exit_void();
1730}
1731
50008630
TY
1732/*
1733 * ocfs2_open_lock always get PR mode lock.
1734 */
/* Unconditional (queued) PR acquisition of the per-inode open lock.
 * No-op on local-only mounts.  Compare ocfs2_try_open_lock() below,
 * which is the NOQUEUE/trylock variant. */
1735int ocfs2_open_lock(struct inode *inode)
1736{
1737	int status = 0;
1738	struct ocfs2_lock_res *lockres;
1739	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1740
1741	BUG_ON(!inode);
1742
1743	mlog_entry_void();
1744
1745	mlog(0, "inode %llu take PRMODE open lock\n",
1746	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1747
1748	if (ocfs2_mount_local(osb))
1749		goto out;
1750
1751	lockres = &OCFS2_I(inode)->ip_open_lockres;
1752
1753	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
bd3e7610 1754				    DLM_LOCK_PR, 0, 0);
50008630
TY
1755	if (status < 0)
1756		mlog_errno(status);
1757
1758out:
1759	mlog_exit(status);
1760	return status;
1761}
1762
/* Trylock variant of the open lock (DLM_LKF_NOQUEUE): EX for 'write',
 * else PR.  -EAGAIN means another node still holds the open lock, i.e.
 * the inode is still in use cluster-wide (see the comment below). */
1763int ocfs2_try_open_lock(struct inode *inode, int write)
1764{
1765	int status = 0, level;
1766	struct ocfs2_lock_res *lockres;
1767	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1768
1769	BUG_ON(!inode);
1770
1771	mlog_entry_void();
1772
1773	mlog(0, "inode %llu try to take %s open lock\n",
1774	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1775	     write ? "EXMODE" : "PRMODE");
1776
1777	if (ocfs2_mount_local(osb))
1778		goto out;
1779
1780	lockres = &OCFS2_I(inode)->ip_open_lockres;
1781
bd3e7610 1782	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
50008630
TY
1783
1784	/*
1785	 * The file system may already holding a PRMODE/EXMODE open lock.
bd3e7610 1786	 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
50008630
TY
1787	 * other nodes and the -EAGAIN will indicate to the caller that
1788	 * this inode is still in use.
1789	 */
1790	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
bd3e7610 1791				    level, DLM_LKF_NOQUEUE, 0);
50008630
TY
1792
1793out:
1794	mlog_exit(status);
1795	return status;
1796}
1797
1798/*
1799 * ocfs2_open_unlock unlock PR and EX mode open locks.
1800 */
/* Drops whatever PR and/or EX holders this node has on the open lock,
 * as indicated by the lockres holder counts.  No-op on local mounts. */
1801void ocfs2_open_unlock(struct inode *inode)
1802{
1803	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1804	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1805
1806	mlog_entry_void();
1807
1808	mlog(0, "inode %llu drop open lock\n",
1809	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1810
1811	if (ocfs2_mount_local(osb))
1812		goto out;
1813
1814	if(lockres->l_ro_holders)
1815		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
bd3e7610 1816				     DLM_LOCK_PR);
50008630
TY
1817	if(lockres->l_ex_holders)
1818		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
bd3e7610 1819				     DLM_LOCK_EX);
50008630
TY
1820
1821out:
1822	mlog_exit_void();
1823}
1824
cf8e06f1
MF
/* A signal interrupted a blocking flock request: cancel the in-flight
 * convert (looping until BUSY clears, since cancel itself can race)
 * and return -ERESTARTSYS — unless the grant won the race and we
 * already sit at the requested level, in which case return 0. */
1825static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
1826				     int level)
1827{
1828	int ret;
1829	struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
1830	unsigned long flags;
1831	struct ocfs2_mask_waiter mw;
1832
1833	ocfs2_init_mask_waiter(&mw);
1834
1835retry_cancel:
1836	spin_lock_irqsave(&lockres->l_lock, flags);
1837	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
1838		ret = ocfs2_prepare_cancel_convert(osb, lockres);
1839		if (ret) {
1840			spin_unlock_irqrestore(&lockres->l_lock, flags);
1841			ret = ocfs2_cancel_convert(osb, lockres);
1842			if (ret < 0) {
1843				mlog_errno(ret);
1844				goto out;
1845			}
1846			goto retry_cancel;
1847		}
1848		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1849		spin_unlock_irqrestore(&lockres->l_lock, flags);
1850
1851		ocfs2_wait_for_mask(&mw);
1852		goto retry_cancel;
1853	}
1854
1855	ret = -ERESTARTSYS;
1856	/*
1857	 * We may still have gotten the lock, in which case there's no
1858	 * point to restarting the syscall.
1859	 */
1860	if (lockres->l_level == level)
1861		ret = 0;
1862
1863	mlog(0, "Cancel returning %d. flags: 0x%lx, level: %d, act: %d\n", ret,
1864	     lockres->l_flags, lockres->l_level, lockres->l_action);
1865
1866	spin_unlock_irqrestore(&lockres->l_lock, flags);
1867
1868out:
1869	return ret;
1870}
1871
1872/*
1873 * ocfs2_file_lock() and ocfs2_file_unlock() map to a single pair of
1874 * flock() calls. The locking approach this requires is sufficiently
1875 * different from all other cluster lock types that we implement a
1876 * separate path to the "low-level" dlm calls. In particular:
1877 *
1878 * - No optimization of lock levels is done - we take at exactly
1879 * what's been requested.
1880 *
1881 * - No lock caching is employed. We immediately downconvert to
1882 * no-lock at unlock time. This also means flock locks never go on
1883 * the blocking list).
1884 *
1885 * - Since userspace can trivially deadlock itself with flock, we make
1886 * sure to allow cancellation of a misbehaving applications flock()
1887 * request.
1888 *
1889 * - Access to any flock lockres doesn't require concurrency, so we
1890 * can simplify the code by requiring the caller to guarantee
1891 * serialization of dlmglue flock calls.
1892 */
/*
 * Take a cluster lock backing a single flock() request.
 *
 * @file:    the file being flocked; its ocfs2_file_private holds the
 *           per-open flock lockres.
 * @ex:      nonzero for an exclusive (write) lock, else shared (read).
 * @trylock: nonzero for LOCK_NB semantics - fail with -EAGAIN rather
 *           than queueing behind other holders.
 *
 * Returns 0 on success, -EINVAL on internal error, -EAGAIN for a failed
 * trylock. The caller guarantees serialization of flock calls on this
 * lockres (see the comment block above).
 */
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
	int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
	unsigned long flags;
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_lock_res *lockres = &fp->fp_flock;
	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

	/* Serialization means we should never see an in-flight request
	 * or a held level here - treat it as a hard error. */
	if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
	    (lockres->l_level > DLM_LOCK_NL)) {
		mlog(ML_ERROR,
		     "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
		     "level: %u\n", lockres->l_name, lockres->l_flags,
		     lockres->l_level);
		return -EINVAL;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/*
		 * Get the lock at NLMODE to start - that way we
		 * can cancel the upconvert request if need be.
		 */
		ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		/* Wait for OCFS2_LOCK_BUSY to clear, i.e. for the
		 * create to complete. */
		ret = ocfs2_wait_for_mask(&mw);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	/* Convert from NL up to the requested level. */
	lockres->l_action = OCFS2_AST_CONVERT;
	lkm_flags |= DLM_LKF_CONVERT;
	lockres->l_requested = level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);

	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
			     lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1);
	if (ret) {
		/* -EAGAIN on a trylock is expected (NOQUEUE); anything
		 * else is a real dlm error. */
		if (!trylock || (ret != -EAGAIN)) {
			ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
			ret = -EINVAL;
		}

		ocfs2_recover_from_dlm_error(lockres, 1);
		lockres_remove_mask_waiter(lockres, &mw);
		goto out;
	}

	ret = ocfs2_wait_for_mask_interruptible(&mw, lockres);
	if (ret == -ERESTARTSYS) {
		/*
		 * Userspace can cause deadlock itself with
		 * flock(). Current behavior locally is to allow the
		 * deadlock, but abort the system call if a signal is
		 * received. We follow this example, otherwise a
		 * poorly written program could sit in kernel until
		 * reboot.
		 *
		 * Handling this is a bit more complicated for Ocfs2
		 * though. We can't exit this function with an
		 * outstanding lock request, so a cancel convert is
		 * required. We intentionally overwrite 'ret' - if the
		 * cancel fails and the lock was granted, it's easier
		 * to just bubble success back up to the user.
		 */
		ret = ocfs2_flock_handle_signal(lockres, level);
	} else if (!ret && (level > lockres->l_level)) {
		/* Trylock failed asynchronously */
		BUG_ON(!trylock);
		ret = -EAGAIN;
	}

out:

	mlog(0, "Lock: \"%s\" ex: %d, trylock: %d, returns: %d\n",
	     lockres->l_name, ex, trylock, ret);
	return ret;
}
1988
/*
 * Release a flock() cluster lock by downconverting the lockres back
 * to NLMODE. No lock caching is done for flocks (see the comment
 * above ocfs2_file_lock()); the caller serializes flock calls.
 */
void ocfs2_file_unlock(struct file *file)
{
	int ret;
	unsigned int gen;
	unsigned long flags;
	struct ocfs2_file_private *fp = file->private_data;
	struct ocfs2_lock_res *lockres = &fp->fp_flock;
	struct ocfs2_super *osb = OCFS2_SB(file->f_mapping->host->i_sb);
	struct ocfs2_mask_waiter mw;

	ocfs2_init_mask_waiter(&mw);

	/* No dlm lock was ever attached - nothing to drop. */
	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
		return;

	/* Already at no-lock - nothing to downconvert. */
	if (lockres->l_level == DLM_LOCK_NL)
		return;

	mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
	     lockres->l_name, lockres->l_flags, lockres->l_level,
	     lockres->l_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/*
	 * Fake a blocking ast for the downconvert code.
	 */
	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
	lockres->l_blocking = DLM_LOCK_EX;

	gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
	if (ret) {
		mlog_errno(ret);
		return;
	}

	/* Wait for OCFS2_LOCK_BUSY to clear - downconvert complete. */
	ret = ocfs2_wait_for_mask(&mw);
	if (ret)
		mlog_errno(ret);
}
2032
34d024f8
MF
2033static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
2034 struct ocfs2_lock_res *lockres)
ccd979bd
MF
2035{
2036 int kick = 0;
2037
2038 mlog_entry_void();
2039
2040 /* If we know that another node is waiting on our lock, kick
34d024f8 2041 * the downconvert thread * pre-emptively when we reach a release
ccd979bd
MF
2042 * condition. */
2043 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
2044 switch(lockres->l_blocking) {
bd3e7610 2045 case DLM_LOCK_EX:
ccd979bd
MF
2046 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
2047 kick = 1;
2048 break;
bd3e7610 2049 case DLM_LOCK_PR:
ccd979bd
MF
2050 if (!lockres->l_ex_holders)
2051 kick = 1;
2052 break;
2053 default:
2054 BUG();
2055 }
2056 }
2057
2058 if (kick)
34d024f8 2059 ocfs2_wake_downconvert_thread(osb);
ccd979bd
MF
2060
2061 mlog_exit_void();
2062}
2063
ccd979bd
MF
2064#define OCFS2_SEC_BITS 34
2065#define OCFS2_SEC_SHIFT (64 - 34)
2066#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1)
2067
2068/* LVB only has room for 64 bits of time here so we pack it for
2069 * now. */
2070static u64 ocfs2_pack_timespec(struct timespec *spec)
2071{
2072 u64 res;
2073 u64 sec = spec->tv_sec;
2074 u32 nsec = spec->tv_nsec;
2075
2076 res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
2077
2078 return res;
2079}
2080
/* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_inode_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);

	/*
	 * Invalidate the LVB of a deleted inode - this way other
	 * nodes are forced to go to disk and discover the new inode
	 * status.
	 */
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		lvb->lvb_version = 0;
		goto out;
	}

	/* Publish the in-memory inode state in the LVB (big-endian on
	 * the wire) so other nodes can refresh without a disk read. */
	lvb->lvb_version = OCFS2_LVB_VERSION;
	lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
	lvb->lvb_igid = cpu_to_be32(inode->i_gid);
	lvb->lvb_imode = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
	lvb->lvb_iatime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
	lvb->lvb_ictime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
	lvb->lvb_imtime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
	lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);

out:
	mlog_meta_lvb(0, lockres);

	mlog_exit_void();
}
2126
2127static void ocfs2_unpack_timespec(struct timespec *spec,
2128 u64 packed_time)
2129{
2130 spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
2131 spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
2132}
2133
/*
 * Repopulate the in-memory inode from a trusted LVB, avoiding a disk
 * read. Counterpart of __ocfs2_stuff_meta_lvb(); only called after
 * ocfs2_meta_lvb_is_trustable() said the LVB contents are valid.
 */
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	mlog_meta_lvb(0, lockres);

	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
	oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
	ocfs2_set_inode_flags(inode);

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks = ocfs2_inode_sector_count(inode);

	inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
	inode->i_gid = be32_to_cpu(lvb->lvb_igid);
	inode->i_mode = be16_to_cpu(lvb->lvb_imode);
	inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
	ocfs2_unpack_timespec(&inode->i_atime,
			      be64_to_cpu(lvb->lvb_iatime_packed));
	ocfs2_unpack_timespec(&inode->i_mtime,
			      be64_to_cpu(lvb->lvb_imtime_packed));
	ocfs2_unpack_timespec(&inode->i_ctime,
			      be64_to_cpu(lvb->lvb_ictime_packed));
	spin_unlock(&oi->ip_lock);

	mlog_exit_void();
}
2175
f9e2d82e
MF
2176static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
2177 struct ocfs2_lock_res *lockres)
ccd979bd 2178{
a641dc2a 2179 struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
ccd979bd 2180
1c520dfb
JB
2181 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)
2182 && lvb->lvb_version == OCFS2_LVB_VERSION
f9e2d82e 2183 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
ccd979bd
MF
2184 return 1;
2185 return 0;
2186}
2187
/* Determine whether a lock resource needs to be refreshed, and
 * arbitrate who gets to refresh it.
 *
 * 0 means no refresh needed.
 *
 * > 0 means you need to refresh this and you MUST call
 * ocfs2_complete_lock_res_refresh afterwards. */
static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
{
	unsigned long flags;
	int status = 0;

	mlog_entry_void();

refresh_check:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	/* Someone else is already refreshing - wait for them to
	 * finish, then re-check whether a refresh is still needed. */
	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_on_refreshing_lock(lockres);
		goto refresh_check;
	}

	/* Ok, I'll be the one to refresh this lock. */
	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = 1;
bail:
	mlog_exit(status);
	return status;
}
2225
/* If status is non zero, I'll mark it as not being in refresh
 * anymore, but I won't clear the needs refresh flag. */
2228static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
2229 int status)
2230{
2231 unsigned long flags;
2232 mlog_entry_void();
2233
2234 spin_lock_irqsave(&lockres->l_lock, flags);
2235 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
2236 if (!status)
2237 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
2238 spin_unlock_irqrestore(&lockres->l_lock, flags);
2239
2240 wake_up(&lockres->l_event);
2241
2242 mlog_exit_void();
2243}
2244
/* may or may not return a bh if it went to disk. */
static int ocfs2_inode_lock_update(struct inode *inode,
				   struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
	struct ocfs2_dinode *fe;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	/* Local mounts have no cluster state to refresh. */
	if (ocfs2_mount_local(osb))
		goto bail;

	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %llu was deleted while we "
		     "were waiting on a lock. ip_flags = 0x%x\n",
		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	/* Returns nonzero only when we won the arbitration and must
	 * complete the refresh below. */
	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(INODE_CACHE(inode));

	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
		mlog(0, "Trusting LVB on inode %llu\n",
		     (unsigned long long)oi->ip_blkno);
		ocfs2_refresh_inode_from_lvb(inode);
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_inode_block(inode, bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;

		/* This is a good chance to make sure we're not
		 * locking an invalid object. ocfs2_read_inode_block()
		 * already checked that the inode block is sane.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %llu disk generation: %u "
				"inode->i_generation: %u\n",
				(unsigned long long)oi->ip_blkno,
				le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
		ocfs2_track_lock_refresh(lockres);
	}

	status = 0;
bail_refresh:
	/* Release the refresh claim; on error NEEDS_REFRESH stays set
	 * so the next locker retries. */
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}
2328
2329static int ocfs2_assign_bh(struct inode *inode,
2330 struct buffer_head **ret_bh,
2331 struct buffer_head *passed_bh)
2332{
2333 int status;
2334
2335 if (passed_bh) {
2336 /* Ok, the update went to disk for us, use the
2337 * returned bh. */
2338 *ret_bh = passed_bh;
2339 get_bh(*ret_bh);
2340
2341 return 0;
2342 }
2343
b657c95c 2344 status = ocfs2_read_inode_block(inode, ret_bh);
ccd979bd
MF
2345 if (status < 0)
2346 mlog_errno(status);
2347
2348 return status;
2349}
2350
/*
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
int ocfs2_inode_lock_full_nested(struct inode *inode,
				 struct buffer_head **ret_bh,
				 int ex,
				 int arg_flags,
				 int subclass)
{
	int status, level, acquired;
	u32 dlm_flags;
	struct ocfs2_lock_res *lockres = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *local_bh = NULL;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu, take %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	status = 0;
	acquired = 0;
	/* We'll allow faking a readonly metadata lock for
	 * rodevices. */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto bail;
	}

	/* Local mounts skip the cluster lock entirely. */
	if (ocfs2_mount_local(osb))
		goto local;

	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		ocfs2_wait_for_recovery(osb);

	lockres = &OCFS2_I(inode)->ip_inode_lockres;
	level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	dlm_flags = 0;
	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
		dlm_flags |= DLM_LKF_NOQUEUE;

	status = __ocfs2_cluster_lock(osb, lockres, level, dlm_flags,
				      arg_flags, subclass, _RET_IP_);
	if (status < 0) {
		if (status != -EAGAIN && status != -EIOCBRETRY)
			mlog_errno(status);
		goto bail;
	}

	/* Notify the error cleanup path to drop the cluster lock. */
	acquired = 1;

	/* We wait twice because a node may have died while we were in
	 * the lower dlm layers. The second time though, we've
	 * committed to owning this lock so we don't allow signals to
	 * abort the operation. */
	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		ocfs2_wait_for_recovery(osb);

local:
	/*
	 * We only see this flag if we're being called from
	 * ocfs2_read_locked_inode(). It means we're locking an inode
	 * which hasn't been populated yet, so clear the refresh flag
	 * and let the caller handle it.
	 */
	if (inode->i_state & I_NEW) {
		status = 0;
		if (lockres)
			ocfs2_complete_lock_res_refresh(lockres, 0);
		goto bail;
	}

	/* This is fun. The caller may want a bh back, or it may
	 * not. ocfs2_inode_lock_update definitely wants one in, but
	 * may or may not read one, depending on what's in the
	 * LVB. The result of all of this is that we've *only* gone to
	 * disk if we have to, so the complexity is worthwhile. */
	status = ocfs2_inode_lock_update(inode, &local_bh);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}

	if (ret_bh) {
		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (status < 0) {
		/* On error, give back any bh we handed out and drop
		 * the cluster lock if we took it. */
		if (ret_bh && (*ret_bh)) {
			brelse(*ret_bh);
			*ret_bh = NULL;
		}
		if (acquired)
			ocfs2_inode_unlock(inode, ex);
	}

	if (local_bh)
		brelse(local_bh);

	mlog_exit(status);
	return status;
}
2465
2466/*
34d024f8
MF
2467 * This is working around a lock inversion between tasks acquiring DLM
2468 * locks while holding a page lock and the downconvert thread which
2469 * blocks dlm lock acquiry while acquiring page locks.
ccd979bd
MF
2470 *
 * ** These _with_page variants are only intended to be called from aop
2472 * methods that hold page locks and return a very specific *positive* error
2473 * code that aop methods pass up to the VFS -- test for errors with != 0. **
2474 *
34d024f8
MF
2475 * The DLM is called such that it returns -EAGAIN if it would have
2476 * blocked waiting for the downconvert thread. In that case we unlock
2477 * our page so the downconvert thread can make progress. Once we've
2478 * done this we have to return AOP_TRUNCATED_PAGE so the aop method
2479 * that called us can bubble that back up into the VFS who will then
2480 * immediately retry the aop call.
ccd979bd
MF
2481 *
2482 * We do a blocking lock and immediate unlock before returning, though, so that
2483 * the lock has a great chance of being cached on this node by the time the VFS
2484 * calls back to retry the aop. This has a potential to livelock as nodes
2485 * ping locks back and forth, but that's a risk we're willing to take to avoid
2486 * the lock inversion simply.
2487 */
int ocfs2_inode_lock_with_page(struct inode *inode,
			       struct buffer_head **ret_bh,
			       int ex,
			       struct page *page)
{
	int ret;

	/* Non-blocking attempt first; -EAGAIN means the downconvert
	 * thread would have to wait on us (lock inversion with the
	 * page lock we hold - see the comment above). */
	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		/* Blocking lock + immediate unlock so the lock is
		 * likely cached locally when the VFS retries the aop. */
		if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
			ocfs2_inode_unlock(inode, ex);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}
2505
e63aecb6 2506int ocfs2_inode_lock_atime(struct inode *inode,
7f1a37e3
TY
2507 struct vfsmount *vfsmnt,
2508 int *level)
2509{
2510 int ret;
2511
2512 mlog_entry_void();
e63aecb6 2513 ret = ocfs2_inode_lock(inode, NULL, 0);
7f1a37e3
TY
2514 if (ret < 0) {
2515 mlog_errno(ret);
2516 return ret;
2517 }
2518
2519 /*
2520 * If we should update atime, we will get EX lock,
2521 * otherwise we just get PR lock.
2522 */
2523 if (ocfs2_should_update_atime(inode, vfsmnt)) {
2524 struct buffer_head *bh = NULL;
2525
e63aecb6
MF
2526 ocfs2_inode_unlock(inode, 0);
2527 ret = ocfs2_inode_lock(inode, &bh, 1);
7f1a37e3
TY
2528 if (ret < 0) {
2529 mlog_errno(ret);
2530 return ret;
2531 }
2532 *level = 1;
2533 if (ocfs2_should_update_atime(inode, vfsmnt))
2534 ocfs2_update_inode_atime(inode, bh);
2535 if (bh)
2536 brelse(bh);
2537 } else
2538 *level = 0;
2539
2540 mlog_exit(ret);
2541 return ret;
2542}
2543
e63aecb6 2544void ocfs2_inode_unlock(struct inode *inode,
ccd979bd
MF
2545 int ex)
2546{
bd3e7610 2547 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
e63aecb6 2548 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
c271c5c2 2549 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
ccd979bd
MF
2550
2551 mlog_entry_void();
2552
b0697053
MF
2553 mlog(0, "inode %llu drop %s META lock\n",
2554 (unsigned long long)OCFS2_I(inode)->ip_blkno,
ccd979bd
MF
2555 ex ? "EXMODE" : "PRMODE");
2556
c271c5c2
SM
2557 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
2558 !ocfs2_mount_local(osb))
ccd979bd
MF
2559 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
2560
2561 mlog_exit_void();
2562}
2563
/*
 * Take the orphan scan lock EX and report the next scan sequence
 * number: the one stored in the LVB when it is valid, otherwise our
 * local sequence + 1. Paired with ocfs2_orphan_scan_unlock().
 * Returns 0 (with the lock held) or a negative error.
 */
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
{
	struct ocfs2_lock_res *lockres;
	struct ocfs2_orphan_scan_lvb *lvb;
	int status = 0;

	/* No cluster locking on hard-readonly or local mounts. */
	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	if (ocfs2_mount_local(osb))
		return 0;

	lockres = &osb->osb_orphan_scan.os_lockres;
	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
	if (status < 0)
		return status;

	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
	if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
	    lvb->lvb_version == OCFS2_ORPHAN_LVB_VERSION)
		*seqno = be32_to_cpu(lvb->lvb_os_seqno);
	else
		*seqno = osb->osb_orphan_scan.os_seqno + 1;

	return status;
}
2590
df152c24 2591void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno)
83273932
SE
2592{
2593 struct ocfs2_lock_res *lockres;
2594 struct ocfs2_orphan_scan_lvb *lvb;
83273932 2595
df152c24
SM
2596 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) {
2597 lockres = &osb->osb_orphan_scan.os_lockres;
2598 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2599 lvb->lvb_version = OCFS2_ORPHAN_LVB_VERSION;
2600 lvb->lvb_os_seqno = cpu_to_be32(seqno);
2601 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2602 }
83273932
SE
2603}
2604
ccd979bd
MF
/*
 * Take the superblock cluster lock (EX if @ex, else PR) and, if we win
 * the refresh arbitration, refresh the slot map it protects.
 * Returns 0 on success or a negative error.
 */
int ocfs2_super_lock(struct ocfs2_super *osb,
		     int ex)
{
	int status = 0;
	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	/* Local mounts take no cluster lock and need no refresh. */
	if (ocfs2_mount_local(osb))
		goto bail;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The super block lock path is really in the best position to
	 * know when resources covered by the lock need to be
	 * refreshed, so we do it here. Of course, making sense of
	 * everything is up to the caller :) */
	status = ocfs2_should_refresh_lock_res(lockres);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (status) {
		/* We won the arbitration - do the refresh and release
		 * the REFRESHING claim whatever the outcome. */
		status = ocfs2_refresh_slot_info(osb);

		ocfs2_complete_lock_res_refresh(lockres, status);

		if (status < 0)
			mlog_errno(status);
		ocfs2_track_lock_refresh(lockres);
	}
bail:
	mlog_exit(status);
	return status;
}
2648
2649void ocfs2_super_unlock(struct ocfs2_super *osb,
2650 int ex)
2651{
bd3e7610 2652 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
ccd979bd
MF
2653 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2654
c271c5c2
SM
2655 if (!ocfs2_mount_local(osb))
2656 ocfs2_cluster_unlock(osb, lockres, level);
ccd979bd
MF
2657}
2658
2659int ocfs2_rename_lock(struct ocfs2_super *osb)
2660{
2661 int status;
2662 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
2663
2664 if (ocfs2_is_hard_readonly(osb))
2665 return -EROFS;
2666
c271c5c2
SM
2667 if (ocfs2_mount_local(osb))
2668 return 0;
2669
bd3e7610 2670 status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
ccd979bd
MF
2671 if (status < 0)
2672 mlog_errno(status);
2673
2674 return status;
2675}
2676
2677void ocfs2_rename_unlock(struct ocfs2_super *osb)
2678{
2679 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
2680
c271c5c2 2681 if (!ocfs2_mount_local(osb))
bd3e7610 2682 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
ccd979bd
MF
2683}
2684
6ca497a8 2685int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
2686{
2687 int status;
2688 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2689
2690 if (ocfs2_is_hard_readonly(osb))
2691 return -EROFS;
2692
2693 if (ocfs2_mount_local(osb))
2694 return 0;
2695
2696 status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
2697 0, 0);
2698 if (status < 0)
2699 mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
2700
2701 return status;
2702}
2703
2704void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
2705{
2706 struct ocfs2_lock_res *lockres = &osb->osb_nfs_sync_lockres;
2707
2708 if (!ocfs2_mount_local(osb))
2709 ocfs2_cluster_unlock(osb, lockres,
2710 ex ? LKM_EXMODE : LKM_PRMODE);
2711}
2712
d680efe9
MF
2713int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2714{
2715 int ret;
bd3e7610 2716 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
d680efe9
MF
2717 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2718 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2719
2720 BUG_ON(!dl);
2721
2722 if (ocfs2_is_hard_readonly(osb))
2723 return -EROFS;
2724
c271c5c2
SM
2725 if (ocfs2_mount_local(osb))
2726 return 0;
2727
d680efe9
MF
2728 ret = ocfs2_cluster_lock(osb, &dl->dl_lockres, level, 0, 0);
2729 if (ret < 0)
2730 mlog_errno(ret);
2731
2732 return ret;
2733}
2734
2735void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2736{
bd3e7610 2737 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
d680efe9
MF
2738 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2739 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2740
c271c5c2
SM
2741 if (!ocfs2_mount_local(osb))
2742 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
d680efe9
MF
2743}
2744
ccd979bd
MF
2745/* Reference counting of the dlm debug structure. We want this because
2746 * open references on the debug inodes can live on after a mount, so
2747 * we can't rely on the ocfs2_super to always exist. */
2748static void ocfs2_dlm_debug_free(struct kref *kref)
2749{
2750 struct ocfs2_dlm_debug *dlm_debug;
2751
2752 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
2753
2754 kfree(dlm_debug);
2755}
2756
2757void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
2758{
2759 if (dlm_debug)
2760 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
2761}
2762
/* Take an extra reference on @debug; paired with ocfs2_put_dlm_debug(). */
static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
{
	kref_get(&debug->d_refcnt);
}
2767
2768struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
2769{
2770 struct ocfs2_dlm_debug *dlm_debug;
2771
2772 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
2773 if (!dlm_debug) {
2774 mlog_errno(-ENOMEM);
2775 goto out;
2776 }
2777
2778 kref_init(&dlm_debug->d_refcnt);
2779 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
2780 dlm_debug->d_locking_state = NULL;
2781out:
2782 return dlm_debug;
2783}
2784
/* Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	struct ocfs2_dlm_debug *p_dlm_debug;	/* tracking list being iterated */
	struct ocfs2_lock_res p_iter_res;	/* dummy cursor on the list (l_ops == NULL) */
	struct ocfs2_lock_res p_tmp_res;	/* snapshot of the current lockres */
};
2791
/*
 * Walk the debug tracking list from @start and return the next real
 * lockres, or NULL at the end of the list. Caller holds
 * ocfs2_dlm_tracking_lock.
 */
static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
						 struct ocfs2_dlm_seq_priv *priv)
{
	struct ocfs2_lock_res *iter, *ret = NULL;
	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;

	assert_spin_locked(&ocfs2_dlm_tracking_lock);

	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
		/* discover the head of the list */
		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
			mlog(0, "End of list found, %p\n", ret);
			break;
		}

		/* We track our "dummy" iteration lockres' by a NULL
		 * l_ops field. */
		if (iter->l_ops != NULL) {
			ret = iter;
			break;
		}
	}

	return ret;
}
2817
/* seq_file ->start() hook: return the first real lockres after our
 * iteration cursor, copied into p_tmp_res (see comment below). */
static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
	if (iter) {
		/* Since lockres' have the lifetime of their container
		 * (which can be inodes, ocfs2_supers, etc) we want to
		 * copy this out to a temporary lockres while still
		 * under the spinlock. Obviously after this we can't
		 * trust any pointers on the copy returned, but that's
		 * ok as the information we want isn't typically held
		 * in them. */
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
2840
/* seq_file ->stop() hook: nothing to clean up - iteration state lives
 * in the private structure and persists across calls. */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}
2844
/* seq_file ->next() hook: advance past the current lockres, move the
 * dummy cursor to the new position, and snapshot the entry (see the
 * lifetime comment in ocfs2_dlm_seq_start()). */
static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter = v;
	struct ocfs2_lock_res *dummy = &priv->p_iter_res;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(iter, priv);
	list_del_init(&dummy->l_debug_list);
	if (iter) {
		list_add(&dummy->l_debug_list, &iter->l_debug_list);
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
2863
2864/* So that debugfs.ocfs2 can determine which format is being used */
8ddb7b00 2865#define OCFS2_DLM_DEBUG_STR_VERSION 2
ccd979bd
MF
/*
 * seq_file ->show hook: emit one tab-separated record per lockres.
 * The record layout is versioned (OCFS2_DLM_DEBUG_STR_VERSION) and is
 * parsed by the userspace debugfs.ocfs2 tool, so field order and
 * formatting must not change without bumping the version.
 */
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
	int i;
	char *lvb;
	struct ocfs2_lock_res *lockres = v;

	if (!lockres)
		return -EINVAL;

	seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION);

	/* Dentry locks embed the parent dir block number in the name;
	 * print the trailing inode number portion in hex. */
	if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY)
		seq_printf(m, "%.*s%08x\t", OCFS2_DENTRY_LOCK_INO_START - 1,
			   lockres->l_name,
			   (unsigned int)ocfs2_get_dentry_lock_ino(lockres));
	else
		seq_printf(m, "%.*s\t", OCFS2_LOCK_ID_MAX_LEN, lockres->l_name);

	seq_printf(m, "%d\t"
		   "0x%lx\t"
		   "0x%x\t"
		   "0x%x\t"
		   "%u\t"
		   "%u\t"
		   "%d\t"
		   "%d\t",
		   lockres->l_level,
		   lockres->l_flags,
		   lockres->l_action,
		   lockres->l_unlock_action,
		   lockres->l_ro_holders,
		   lockres->l_ex_holders,
		   lockres->l_requested,
		   lockres->l_blocking);

	/* Dump the raw LVB */
	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
	for(i = 0; i < DLM_LVB_LEN; i++)
		seq_printf(m, "0x%x\t", lvb[i]);

	/* With CONFIG_OCFS2_FS_STATS off the stats fields are printed as
	 * zeros so the record shape stays identical for the parser. */
#ifdef CONFIG_OCFS2_FS_STATS
# define lock_num_prmode(_l)		(_l)->l_lock_num_prmode
# define lock_num_exmode(_l)		(_l)->l_lock_num_exmode
# define lock_num_prmode_failed(_l)	(_l)->l_lock_num_prmode_failed
# define lock_num_exmode_failed(_l)	(_l)->l_lock_num_exmode_failed
# define lock_total_prmode(_l)		(_l)->l_lock_total_prmode
# define lock_total_exmode(_l)		(_l)->l_lock_total_exmode
# define lock_max_prmode(_l)		(_l)->l_lock_max_prmode
# define lock_max_exmode(_l)		(_l)->l_lock_max_exmode
# define lock_refresh(_l)		(_l)->l_lock_refresh
#else
# define lock_num_prmode(_l)		(0ULL)
# define lock_num_exmode(_l)		(0ULL)
# define lock_num_prmode_failed(_l)	(0)
# define lock_num_exmode_failed(_l)	(0)
# define lock_total_prmode(_l)		(0ULL)
# define lock_total_exmode(_l)		(0ULL)
# define lock_max_prmode(_l)		(0)
# define lock_max_exmode(_l)		(0)
# define lock_refresh(_l)		(0)
#endif
	/* The following seq_print was added in version 2 of this output */
	seq_printf(m, "%llu\t"
		   "%llu\t"
		   "%u\t"
		   "%u\t"
		   "%llu\t"
		   "%llu\t"
		   "%u\t"
		   "%u\t"
		   "%u\t",
		   lock_num_prmode(lockres),
		   lock_num_exmode(lockres),
		   lock_num_prmode_failed(lockres),
		   lock_num_exmode_failed(lockres),
		   lock_total_prmode(lockres),
		   lock_total_exmode(lockres),
		   lock_max_prmode(lockres),
		   lock_max_exmode(lockres),
		   lock_refresh(lockres));

	/* End the line */
	seq_printf(m, "\n");
	return 0;
}
2951
90d99779 2952static const struct seq_operations ocfs2_dlm_seq_ops = {
ccd979bd
MF
2953 .start = ocfs2_dlm_seq_start,
2954 .stop = ocfs2_dlm_seq_stop,
2955 .next = ocfs2_dlm_seq_next,
2956 .show = ocfs2_dlm_seq_show,
2957};
2958
2959static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2960{
2961 struct seq_file *seq = (struct seq_file *) file->private_data;
2962 struct ocfs2_dlm_seq_priv *priv = seq->private;
2963 struct ocfs2_lock_res *res = &priv->p_iter_res;
2964
2965 ocfs2_remove_lockres_tracking(res);
2966 ocfs2_put_dlm_debug(priv->p_dlm_debug);
2967 return seq_release_private(inode, file);
2968}
2969
2970static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
2971{
2972 int ret;
2973 struct ocfs2_dlm_seq_priv *priv;
2974 struct seq_file *seq;
2975 struct ocfs2_super *osb;
2976
2977 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
2978 if (!priv) {
2979 ret = -ENOMEM;
2980 mlog_errno(ret);
2981 goto out;
2982 }
8e18e294 2983 osb = inode->i_private;
ccd979bd
MF
2984 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2985 priv->p_dlm_debug = osb->osb_dlm_debug;
2986 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2987
2988 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2989 if (ret) {
2990 kfree(priv);
2991 mlog_errno(ret);
2992 goto out;
2993 }
2994
2995 seq = (struct seq_file *) file->private_data;
2996 seq->private = priv;
2997
2998 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2999 priv->p_dlm_debug);
3000
3001out:
3002 return ret;
3003}
3004
4b6f5d20 3005static const struct file_operations ocfs2_dlm_debug_fops = {
ccd979bd
MF
3006 .open = ocfs2_dlm_debug_open,
3007 .release = ocfs2_dlm_debug_release,
3008 .read = seq_read,
3009 .llseek = seq_lseek,
3010};
3011
/*
 * Create the per-mount "locking_state" debugfs file and take an extra
 * reference on the dlm_debug object for it.  The matching teardown is
 * ocfs2_dlm_shutdown_debug().  Returns 0 on success, -EINVAL if the
 * debugfs file could not be created.
 */
static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
	int ret = 0;
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
							 S_IFREG|S_IRUSR,
							 osb->osb_debug_root,
							 osb,
							 &ocfs2_dlm_debug_fops);
	if (!dlm_debug->d_locking_state) {
		ret = -EINVAL;
		mlog(ML_ERROR,
		     "Unable to create locking state debugfs file.\n");
		goto out;
	}

	/* Reference held on behalf of the debugfs file; dropped in
	 * ocfs2_dlm_shutdown_debug(). */
	ocfs2_get_dlm_debug(dlm_debug);
out:
	return ret;
}
3033
3034static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
3035{
3036 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
3037
3038 if (dlm_debug) {
3039 debugfs_remove(dlm_debug->d_locking_state);
3040 ocfs2_put_dlm_debug(dlm_debug);
3041 }
3042}
3043
/*
 * Bring up everything this mount needs to talk to the DLM: the debugfs
 * state, the downconvert thread, the cluster connection, and the
 * osb-global lock resources.  Local (non-clustered) mounts skip the
 * cluster pieces entirely and only initialize the lock resources.
 *
 * On failure the partial setup (debugfs, downconvert thread, and — for
 * the node-number failure case — the cluster connection) is torn down
 * before returning.  Returns 0 or a negative error code.
 */
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
	int status = 0;
	struct ocfs2_cluster_connection *conn = NULL;

	mlog_entry_void();

	if (ocfs2_mount_local(osb)) {
		/* Single-node mount: no cluster stack, we are node 0. */
		osb->node_num = 0;
		goto local;
	}

	status = ocfs2_dlm_init_debug(osb);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* launch downconvert thread */
	osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
	if (IS_ERR(osb->dc_task)) {
		status = PTR_ERR(osb->dc_task);
		osb->dc_task = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* for now, uuid == domain */
	status = ocfs2_cluster_connect(osb->osb_cluster_stack,
				       osb->uuid_str,
				       strlen(osb->uuid_str),
				       &lproto, ocfs2_do_node_down, osb,
				       &conn);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_cluster_this_node(&osb->node_num);
	if (status < 0) {
		mlog_errno(status);
		mlog(ML_ERROR,
		     "could not find this host's node number\n");
		/* conn is not yet published in osb->cconn, so we must
		 * disconnect it explicitly before bailing. */
		ocfs2_cluster_disconnect(conn, 0);
		goto bail;
	}

local:
	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
	ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
	ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);

	osb->cconn = conn;

	status = 0;
bail:
	if (status < 0) {
		/* Undo whatever partial setup succeeded above. */
		ocfs2_dlm_shutdown_debug(osb);
		if (osb->dc_task)
			kthread_stop(osb->dc_task);
	}

	mlog_exit(status);
	return status;
}
3110
286eaa95
JB
/*
 * Mirror of ocfs2_dlm_init(): drop the osb-global locks, stop the
 * downconvert thread, free the lock resources, disconnect from the
 * cluster, and remove the debugfs state.  @hangup_pending is passed
 * through to ocfs2_cluster_disconnect().
 */
void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
			int hangup_pending)
{
	mlog_entry_void();

	ocfs2_drop_osb_locks(osb);

	/*
	 * Now that we have dropped all locks and ocfs2_dismount_volume()
	 * has disabled recovery, the DLM won't be talking to us. It's
	 * safe to tear things down before disconnecting the cluster.
	 */

	if (osb->dc_task) {
		kthread_stop(osb->dc_task);
		osb->dc_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);
	ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
	ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);

	ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
	osb->cconn = NULL;

	ocfs2_dlm_shutdown_debug(osb);

	mlog_exit_void();
}
3141
ccd979bd 3142static int ocfs2_drop_lock(struct ocfs2_super *osb,
0d5dc6c2 3143 struct ocfs2_lock_res *lockres)
ccd979bd 3144{
7431cd7e 3145 int ret;
ccd979bd 3146 unsigned long flags;
bd3e7610 3147 u32 lkm_flags = 0;
ccd979bd
MF
3148
3149 /* We didn't get anywhere near actually using this lockres. */
3150 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
3151 goto out;
3152
b80fc012 3153 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
bd3e7610 3154 lkm_flags |= DLM_LKF_VALBLK;
b80fc012 3155
ccd979bd
MF
3156 spin_lock_irqsave(&lockres->l_lock, flags);
3157
3158 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
3159 "lockres %s, flags 0x%lx\n",
3160 lockres->l_name, lockres->l_flags);
3161
3162 while (lockres->l_flags & OCFS2_LOCK_BUSY) {
3163 mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
3164 "%u, unlock_action = %u\n",
3165 lockres->l_name, lockres->l_flags, lockres->l_action,
3166 lockres->l_unlock_action);
3167
3168 spin_unlock_irqrestore(&lockres->l_lock, flags);
3169
3170 /* XXX: Today we just wait on any busy
3171 * locks... Perhaps we need to cancel converts in the
3172 * future? */
3173 ocfs2_wait_on_busy_lock(lockres);
3174
3175 spin_lock_irqsave(&lockres->l_lock, flags);
3176 }
3177
0d5dc6c2
MF
3178 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
3179 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
bd3e7610 3180 lockres->l_level == DLM_LOCK_EX &&
0d5dc6c2
MF
3181 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
3182 lockres->l_ops->set_lvb(lockres);
3183 }
ccd979bd
MF
3184
3185 if (lockres->l_flags & OCFS2_LOCK_BUSY)
3186 mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
3187 lockres->l_name);
3188 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
3189 mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
3190
3191 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
3192 spin_unlock_irqrestore(&lockres->l_lock, flags);
3193 goto out;
3194 }
3195
3196 lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
3197
3198 /* make sure we never get here while waiting for an ast to
3199 * fire. */
3200 BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
3201
3202 /* is this necessary? */
3203 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
3204 lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
3205 spin_unlock_irqrestore(&lockres->l_lock, flags);
3206
3207 mlog(0, "lock %s\n", lockres->l_name);
3208
a796d286 3209 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags);
7431cd7e
JB
3210 if (ret) {
3211 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
ccd979bd 3212 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
cf0acdcd 3213 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
ccd979bd
MF
3214 BUG();
3215 }
73ac36ea 3216 mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
ccd979bd
MF
3217 lockres->l_name);
3218
3219 ocfs2_wait_on_busy_lock(lockres);
3220out:
3221 mlog_exit(0);
3222 return 0;
3223}
3224
/* Mark the lockres as being dropped. It will no longer be
 * queued if blocking, but we still may have to wait on it
 * being dequeued from the downconvert thread before we can consider
 * it safe to drop.
 *
 * You can *not* attempt to call cluster_lock on this lockres anymore. */
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
	/* Loop (rather than wait once) because the downconvert thread
	 * could requeue the lockres between wakeups. */
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		mlog(0, "Waiting on lockres %s\n", lockres->l_name);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}
3255
d680efe9
MF
/*
 * Convenience wrapper: flag the lockres as going away, then release it
 * to the DLM.  Failures from the drop are logged, not propagated.
 */
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
			       struct ocfs2_lock_res *lockres)
{
	int status;

	ocfs2_mark_lockres_freeing(lockres);

	status = ocfs2_drop_lock(osb, lockres);
	if (status)
		mlog_errno(status);
}
ccd979bd 3266
d680efe9
MF
/* Drop the four osb-global lock resources set up in ocfs2_dlm_init(). */
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
	ocfs2_simple_drop_lockres(osb, &osb->osb_super_lockres);
	ocfs2_simple_drop_lockres(osb, &osb->osb_rename_lockres);
	ocfs2_simple_drop_lockres(osb, &osb->osb_nfs_sync_lockres);
	ocfs2_simple_drop_lockres(osb, &osb->osb_orphan_scan.os_lockres);
}
3274
ccd979bd
MF
/*
 * Drop all three per-inode lock resources (open, inode, rw).  Each
 * drop is attempted regardless of earlier failures; the first error
 * encountered is the one returned.
 */
int ocfs2_drop_inode_locks(struct inode *inode)
{
	int status, err;

	mlog_entry_void();

	/* No need to call ocfs2_mark_lockres_freeing here -
	 * ocfs2_clear_inode has done it for us. */

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_open_lockres);
	if (err < 0)
		mlog_errno(err);

	status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_inode_lockres);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_rw_lockres);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	mlog_exit(status);
	return status;
}
3308
de551246
JB
/*
 * Stage a downconvert to @new_level while holding l_lock: record the
 * action and requested level, mark the lockres busy, and arm the
 * PENDING state.  Returns the pending generation, which the caller
 * passes to ocfs2_downconvert_lock() -> lockres_clear_pending() after
 * the actual dlm_lock() call.
 */
static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
					      int new_level)
{
	assert_spin_locked(&lockres->l_lock);

	BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);

	/* A downconvert must strictly lower the level. */
	if (lockres->l_level <= new_level) {
		mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
		     lockres->l_level, new_level);
		BUG();
	}

	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
	     lockres->l_name, new_level, lockres->l_blocking);

	lockres->l_action = OCFS2_AST_DOWNCONVERT;
	lockres->l_requested = new_level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	return lockres_set_pending(lockres);
}
3330
/*
 * Issue the actual DLM convert for a staged downconvert.  @lvb tells
 * us whether to hand the value block back (DLM_LKF_VALBLK);
 * @generation pairs with ocfs2_prepare_downconvert() so the PENDING
 * flag is cleared only for our own request.  On error the lockres
 * state is rolled back via ocfs2_recover_from_dlm_error().
 */
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
				  struct ocfs2_lock_res *lockres,
				  int new_level,
				  int lvb,
				  unsigned int generation)
{
	int ret;
	u32 dlm_flags = DLM_LKF_CONVERT;

	mlog_entry_void();

	if (lvb)
		dlm_flags |= DLM_LKF_VALBLK;

	ret = ocfs2_dlm_lock(osb->cconn,
			     new_level,
			     &lockres->l_lksb,
			     dlm_flags,
			     lockres->l_name,
			     OCFS2_LOCK_ID_MAX_LEN - 1);
	/* Must follow the dlm_lock() call — see the PENDING race
	 * discussion in ocfs2_unblock_lock(). */
	lockres_clear_pending(lockres, generation, osb);
	if (ret) {
		ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
		ocfs2_recover_from_dlm_error(lockres, 1);
		goto bail;
	}

	ret = 0;
bail:
	mlog_exit(ret);
	return ret;
}
3363
24ef1815 3364/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
ccd979bd
MF
3365static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3366 struct ocfs2_lock_res *lockres)
3367{
3368 assert_spin_locked(&lockres->l_lock);
3369
3370 mlog_entry_void();
3371 mlog(0, "lock %s\n", lockres->l_name);
3372
3373 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3374 /* If we're already trying to cancel a lock conversion
3375 * then just drop the spinlock and allow the caller to
3376 * requeue this lock. */
3377
3378 mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
3379 return 0;
3380 }
3381
3382 /* were we in a convert when we got the bast fire? */
3383 BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
3384 lockres->l_action != OCFS2_AST_DOWNCONVERT);
3385 /* set things up for the unlockast to know to just
3386 * clear out the ast_action and unset busy, etc. */
3387 lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
3388
3389 mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
3390 "lock %s, invalid flags: 0x%lx\n",
3391 lockres->l_name, lockres->l_flags);
3392
3393 return 1;
3394}
3395
/*
 * Ask the DLM to cancel an in-flight convert (DLM_LKF_CANCEL).  Must
 * only be called after ocfs2_prepare_cancel_convert() returned 1.  On
 * error the lockres state is rolled back via
 * ocfs2_recover_from_dlm_error().
 */
static int ocfs2_cancel_convert(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int ret;

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
			       DLM_LKF_CANCEL);
	if (ret) {
		ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
		ocfs2_recover_from_dlm_error(lockres, 0);
	}

	mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);

	mlog_exit(ret);
	return ret;
}
3416
b5e500e2
MF
/*
 * Core of the downconvert thread's per-lockres processing.  Decides
 * whether a lockres that another node is blocked on can be
 * downconverted now, must be requeued for later, or needs an in-flight
 * convert canceled first.  ctl->requeue and ctl->unblock_action report
 * the decision back to the caller.
 */
static int ocfs2_unblock_lock(struct ocfs2_super *osb,
			      struct ocfs2_lock_res *lockres,
			      struct ocfs2_unblock_ctl *ctl)
{
	unsigned long flags;
	int blocking;
	int new_level;
	int level;
	int ret = 0;
	int set_lvb = 0;
	unsigned int gen;

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

recheck:
	/*
	 * Is it still blocking? If not, we have no more work to do.
	 */
	if (!(lockres->l_flags & OCFS2_LOCK_BLOCKED)) {
		BUG_ON(lockres->l_blocking != DLM_LOCK_NL);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		ret = 0;
		goto leave;
	}

	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		/* XXX
		 * This is a *big* race.  The OCFS2_LOCK_PENDING flag
		 * exists entirely for one reason - another thread has set
		 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
		 *
		 * If we do ocfs2_cancel_convert() before the other thread
		 * calls dlm_lock(), our cancel will do nothing.  We will
		 * get no ast, and we will have no way of knowing the
		 * cancel failed.  Meanwhile, the other thread will call
		 * into dlm_lock() and wait...forever.
		 *
		 * Why forever?  Because another node has asked for the
		 * lock first; that's why we're here in unblock_lock().
		 *
		 * The solution is OCFS2_LOCK_PENDING.  When PENDING is
		 * set, we just requeue the unblock.  Only when the other
		 * thread has called dlm_lock() and cleared PENDING will
		 * we then cancel their request.
		 *
		 * All callers of dlm_lock() must set OCFS2_DLM_PENDING
		 * at the same time they set OCFS2_DLM_BUSY.  They must
		 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
		 */
		if (lockres->l_flags & OCFS2_LOCK_PENDING)
			goto leave_requeue;

		ctl->requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	/*
	 * This prevents livelocks. OCFS2_LOCK_UPCONVERT_FINISHING flag is
	 * set when the ast is received for an upconvert just before the
	 * OCFS2_LOCK_BUSY flag is cleared. Now if the fs received a bast
	 * on the heels of the ast, we want to delay the downconvert just
	 * enough to allow the up requestor to do its task. Because this
	 * lock is in the blocked queue, the lock will be downconverted
	 * as soon as the requestor is done with the lock.
	 */
	if (lockres->l_flags & OCFS2_LOCK_UPCONVERT_FINISHING)
		goto leave_requeue;

	/*
	 * How can we block and yet be at NL?  We were trying to upconvert
	 * from NL and got canceled.  The code comes back here, and now
	 * we notice and clear BLOCKING.
	 */
	if (lockres->l_level == DLM_LOCK_NL) {
		BUG_ON(lockres->l_ex_holders || lockres->l_ro_holders);
		lockres->l_blocking = DLM_LOCK_NL;
		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto leave;
	}

	/* if we're blocking an exclusive and we have *any* holders,
	 * then requeue. */
	if ((lockres->l_blocking == DLM_LOCK_EX)
	    && (lockres->l_ex_holders || lockres->l_ro_holders))
		goto leave_requeue;

	/* If it's a PR we're blocking, then only
	 * requeue if we've got any EX holders */
	if (lockres->l_blocking == DLM_LOCK_PR &&
	    lockres->l_ex_holders)
		goto leave_requeue;

	/*
	 * Can we get a lock in this state if the holder counts are
	 * zero? The meta data unblock code used to check this.
	 */
	if ((lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
	    && (lockres->l_flags & OCFS2_LOCK_REFRESHING))
		goto leave_requeue;

	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	/* Lock-type veto (e.g. journal not yet checkpointed). */
	if (lockres->l_ops->check_downconvert
	    && !lockres->l_ops->check_downconvert(lockres, new_level))
		goto leave_requeue;

	/* If we get here, then we know that there are no more
	 * incompatible holders (and anyone asking for an incompatible
	 * lock is blocked). We can now downconvert the lock */
	if (!lockres->l_ops->downconvert_worker)
		goto downconvert;

	/* Some lockres types want to do a bit of work before
	 * downconverting a lock. Allow that here. The worker function
	 * may sleep, so we save off a copy of what we're blocking as
	 * it may change while we're not holding the spin lock. */
	blocking = lockres->l_blocking;
	level = lockres->l_level;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ctl->unblock_action = lockres->l_ops->downconvert_worker(lockres, blocking);

	if (ctl->unblock_action == UNBLOCK_STOP_POST)
		goto leave;

	spin_lock_irqsave(&lockres->l_lock, flags);
	if ((blocking != lockres->l_blocking) || (level != lockres->l_level)) {
		/* If this changed underneath us, then we can't drop
		 * it just yet. */
		goto recheck;
	}

downconvert:
	ctl->requeue = 0;

	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
		if (lockres->l_level == DLM_LOCK_EX)
			set_lvb = 1;

		/*
		 * We only set the lvb if the lock has been fully
		 * refreshed - otherwise we risk setting stale
		 * data. Otherwise, there's no need to actually clear
		 * out the lvb here as it's value is still valid.
		 */
		if (set_lvb && !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
			lockres->l_ops->set_lvb(lockres);
	}

	gen = ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
				     gen);

leave:
	mlog_exit(ret);
	return ret;

leave_requeue:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ctl->requeue = 1;

	mlog_exit(0);
	return 0;
}
3592
d680efe9
MF
/*
 * Downconvert worker for inode data locks: flush dirty pages (and for
 * an EX request, toss our page cache entirely) before letting another
 * node take the lock.  Non-regular files carry no page data, so they
 * skip straight through.
 */
static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				     int blocking)
{
	struct inode *inode;
	struct address_space *mapping;

	inode = ocfs2_lock_res_inode(lockres);
	mapping = inode->i_mapping;

	if (!S_ISREG(inode->i_mode))
		goto out;

	/*
	 * We need this before the filemap_fdatawrite() so that it can
	 * transfer the dirty bit from the PTE to the
	 * page. Unfortunately this means that even for EX->PR
	 * downconverts, we'll lose our mappings and have to build
	 * them up again.
	 */
	unmap_mapping_range(mapping, 0, 0, 0);

	if (filemap_fdatawrite(mapping)) {
		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
	sync_mapping_buffers(mapping);
	if (blocking == DLM_LOCK_EX) {
		truncate_inode_pages(mapping, 0);
	} else {
		/* We only need to wait on the I/O if we're not also
		 * truncating pages because truncate_inode_pages waits
		 * for us above. We don't truncate pages if we're
		 * blocking anything < EXMODE because we want to keep
		 * them around in that case. */
		filemap_fdatawait(mapping);
	}

out:
	return UNBLOCK_CONTINUE;
}
3633
a4338481
TM
3634static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
3635 struct ocfs2_lock_res *lockres,
3636 int new_level)
810d5aeb 3637{
a4338481 3638 int checkpointed = ocfs2_ci_fully_checkpointed(ci);
810d5aeb 3639
bd3e7610
JB
3640 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
3641 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
810d5aeb
MF
3642
3643 if (checkpointed)
3644 return 1;
3645
a4338481 3646 ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
810d5aeb
MF
3647 return 0;
3648}
3649
a4338481
TM
/* check_downconvert hook for inode metadata locks: defer to the
 * generic checkpoint test on the inode's metadata cache. */
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
					int new_level)
{
	return ocfs2_ci_checkpointed(INODE_CACHE(ocfs2_lock_res_inode(lockres)),
				     lockres, new_level);
}
3657
810d5aeb
MF
/* set_lvb hook for inode metadata locks: pack the inode's current
 * metadata into the lock value block. */
static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
{
	__ocfs2_stuff_meta_lvb(ocfs2_lock_res_inode(lockres));
}
3664
d680efe9
MF
3665/*
3666 * Does the final reference drop on our dentry lock. Right now this
34d024f8 3667 * happens in the downconvert thread, but we could choose to simplify the
d680efe9
MF
3668 * dlmglue API and push these off to the ocfs2_wq in the future.
3669 */
3670static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
3671 struct ocfs2_lock_res *lockres)
3672{
3673 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
3674 ocfs2_dentry_lock_put(osb, dl);
3675}
3676
3677/*
3678 * d_delete() matching dentries before the lock downconvert.
3679 *
3680 * At this point, any process waiting to destroy the
3681 * dentry_lock due to last ref count is stopped by the
3682 * OCFS2_LOCK_QUEUED flag.
3683 *
3684 * We have two potential problems
3685 *
3686 * 1) If we do the last reference drop on our dentry_lock (via dput)
3687 * we'll wind up in ocfs2_release_dentry_lock(), waiting on
3688 * the downconvert to finish. Instead we take an elevated
3689 * reference and push the drop until after we've completed our
3690 * unblock processing.
3691 *
3692 * 2) There might be another process with a final reference,
3693 * waiting on us to finish processing. If this is the case, we
3694 * detect it and exit out - there's no more dentries anyway.
3695 */
3696static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3697 int blocking)
3698{
3699 struct ocfs2_dentry_lock *dl = ocfs2_lock_res_dl(lockres);
3700 struct ocfs2_inode_info *oi = OCFS2_I(dl->dl_inode);
3701 struct dentry *dentry;
3702 unsigned long flags;
3703 int extra_ref = 0;
3704
3705 /*
3706 * This node is blocking another node from getting a read
3707 * lock. This happens when we've renamed within a
3708 * directory. We've forced the other nodes to d_delete(), but
3709 * we never actually dropped our lock because it's still
3710 * valid. The downconvert code will retain a PR for this node,
3711 * so there's no further work to do.
3712 */
bd3e7610 3713 if (blocking == DLM_LOCK_PR)
d680efe9
MF
3714 return UNBLOCK_CONTINUE;
3715
3716 /*
3717 * Mark this inode as potentially orphaned. The code in
3718 * ocfs2_delete_inode() will figure out whether it actually
3719 * needs to be freed or not.
3720 */
3721 spin_lock(&oi->ip_lock);
3722 oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
3723 spin_unlock(&oi->ip_lock);
3724
3725 /*
3726 * Yuck. We need to make sure however that the check of
3727 * OCFS2_LOCK_FREEING and the extra reference are atomic with
3728 * respect to a reference decrement or the setting of that
3729 * flag.
3730 */
3731 spin_lock_irqsave(&lockres->l_lock, flags);
3732 spin_lock(&dentry_attach_lock);
3733 if (!(lockres->l_flags & OCFS2_LOCK_FREEING)
3734 && dl->dl_count) {
3735 dl->dl_count++;
3736 extra_ref = 1;
3737 }
3738 spin_unlock(&dentry_attach_lock);
3739 spin_unlock_irqrestore(&lockres->l_lock, flags);
3740
3741 mlog(0, "extra_ref = %d\n", extra_ref);
3742
3743 /*
3744 * We have a process waiting on us in ocfs2_dentry_iput(),
3745 * which means we can't have any more outstanding
3746 * aliases. There's no need to do any more work.
3747 */
3748 if (!extra_ref)
3749 return UNBLOCK_CONTINUE;
3750
3751 spin_lock(&dentry_attach_lock);
3752 while (1) {
3753 dentry = ocfs2_find_local_alias(dl->dl_inode,
3754 dl->dl_parent_blkno, 1);
3755 if (!dentry)
3756 break;
3757 spin_unlock(&dentry_attach_lock);
3758
3759 mlog(0, "d_delete(%.*s);\n", dentry->d_name.len,
3760 dentry->d_name.name);
3761
3762 /*
3763 * The following dcache calls may do an
3764 * iput(). Normally we don't want that from the
3765 * downconverting thread, but in this case it's ok
3766 * because the requesting node already has an
3767 * exclusive lock on the inode, so it can't be queued
3768 * for a downconvert.
3769 */
3770 d_delete(dentry);
3771 dput(dentry);
3772
3773 spin_lock(&dentry_attach_lock);
3774 }
3775 spin_unlock(&dentry_attach_lock);
3776
3777 /*
3778 * If we are the last holder of this dentry lock, there is no
3779 * reason to downconvert so skip straight to the unlock.
3780 */
3781 if (dl->dl_count == 1)
3782 return UNBLOCK_STOP_POST;
3783
3784 return UNBLOCK_CONTINUE_POST;
3785}
3786
8dec98ed
TM
3787static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
3788 int new_level)
3789{
3790 struct ocfs2_refcount_tree *tree =
3791 ocfs2_lock_res_refcount_tree(lockres);
3792
3793 return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
3794}
3795
3796static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
3797 int blocking)
3798{
3799 struct ocfs2_refcount_tree *tree =
3800 ocfs2_lock_res_refcount_tree(lockres);
3801
3802 ocfs2_metadata_cache_purge(&tree->rf_ci);
3803
3804 return UNBLOCK_CONTINUE;
3805}
3806
9e33d69f
JK
/*
 * set_lvb hook for the global quota info lock: pack the in-memory
 * quota parameters into the lock value block (big-endian on the wire)
 * so other nodes can pick them up without a disk read.
 */
static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
{
	struct ocfs2_qinfo_lvb *lvb;
	struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
					    oinfo->dqi_gi.dqi_type);

	mlog_entry_void();

	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
	lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
	lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);

	mlog_exit_void();
}
3827
3828void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3829{
3830 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3831 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3832 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3833
3834 mlog_entry_void();
3835 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
3836 ocfs2_cluster_unlock(osb, lockres, level);
3837 mlog_exit_void();
3838}
3839
3840static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
3841{
3842 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3843 oinfo->dqi_gi.dqi_type);
3844 struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
3845 struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
85eb8b73 3846 struct buffer_head *bh = NULL;
9e33d69f
JK
3847 struct ocfs2_global_disk_dqinfo *gdinfo;
3848 int status = 0;
3849
1c520dfb
JB
3850 if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
3851 lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
9e33d69f
JK
3852 info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
3853 info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
3854 oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
3855 oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
3856 oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
3857 oinfo->dqi_gi.dqi_free_entry =
3858 be32_to_cpu(lvb->lvb_free_entry);
3859 } else {
85eb8b73
JB
3860 status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
3861 if (status) {
9e33d69f
JK
3862 mlog_errno(status);
3863 goto bail;
3864 }
3865 gdinfo = (struct ocfs2_global_disk_dqinfo *)
3866 (bh->b_data + OCFS2_GLOBAL_INFO_OFF);
3867 info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
3868 info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
3869 oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
3870 oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
3871 oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
3872 oinfo->dqi_gi.dqi_free_entry =
3873 le32_to_cpu(gdinfo->dqi_free_entry);
3874 brelse(bh);
3875 ocfs2_track_lock_refresh(lockres);
3876 }
3877
3878bail:
3879 return status;
3880}
3881
/* Lock quota info, this function expects at least shared lock on the quota file
 * so that we can safely refresh quota info from disk.
 *
 * On success the caller holds oinfo->dqi_gqlock at the requested level
 * (EX when @ex is nonzero, PR otherwise) and the in-memory quota info has
 * been refreshed if the lock resource required it.  Returns 0 on success,
 * -EROFS for an EX request on a hard-readonly mount, or a negative error
 * code; on refresh failure the lock is dropped before returning. */
int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
{
	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
	int status = 0;

	mlog_entry_void();

	/* On RO devices, locking really isn't needed... */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto bail;
	}
	/* Single-node (local) mounts never contend, so skip the DLM. */
	if (ocfs2_mount_local(osb))
		goto bail;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;
	/* OK, we have the lock but we need to refresh the quota info */
	status = ocfs2_refresh_qinfo(oinfo);
	if (status)
		ocfs2_qinfo_unlock(oinfo, ex);
	/* record the refresh outcome on the lock resource */
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}
3918
8dec98ed
TM
3919int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
3920{
3921 int status;
3922 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3923 struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
3924 struct ocfs2_super *osb = lockres->l_priv;
3925
3926
3927 if (ocfs2_is_hard_readonly(osb))
3928 return -EROFS;
3929
3930 if (ocfs2_mount_local(osb))
3931 return 0;
3932
3933 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
3934 if (status < 0)
3935 mlog_errno(status);
3936
3937 return status;
3938}
3939
3940void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
3941{
3942 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3943 struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
3944 struct ocfs2_super *osb = lockres->l_priv;
3945
3946 if (!ocfs2_mount_local(osb))
3947 ocfs2_cluster_unlock(osb, lockres, level);
3948}
3949
00600056
AB
/*
 * Service one lock resource that was queued for the downconvert thread:
 * run the unblock machinery on it and either clear its QUEUED flag or
 * requeue it if more work remains.  Called without any locks held.
 */
static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				       struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_unblock_ctl ctl = {0, 0,};
	unsigned long flags;

	/* Our reference to the lockres in this function can be
	 * considered valid until we remove the OCFS2_LOCK_QUEUED
	 * flag. */

	mlog_entry_void();

	BUG_ON(!lockres);
	BUG_ON(!lockres->l_ops);

	mlog(0, "lockres %s blocked.\n", lockres->l_name);

	/* Detect whether a lock has been marked as going away while
	 * the downconvert thread was processing other things. A lock can
	 * still be marked with OCFS2_LOCK_FREEING after this check,
	 * but short circuiting here will still save us some
	 * performance. */
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_FREEING)
		goto unqueue;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	/* called without l_lock held; fills ctl with the requeue
	 * decision and the unblock action taken */
	status = ocfs2_unblock_lock(osb, lockres, &ctl);
	if (status < 0)
		mlog_errno(status);

	spin_lock_irqsave(&lockres->l_lock, flags);
unqueue:
	if (lockres->l_flags & OCFS2_LOCK_FREEING || !ctl.requeue) {
		/* done (or dying): drop our queue reference */
		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
	} else
		ocfs2_schedule_blocked_lock(osb, lockres);

	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
	     ctl.requeue ? "yes" : "no");
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	/* run the per-type post-unlock callback outside of l_lock */
	if (ctl.unblock_action != UNBLOCK_CONTINUE
	    && lockres->l_ops->post_unlock)
		lockres->l_ops->post_unlock(osb, lockres);

	mlog_exit_void();
}
3999
/*
 * Queue @lockres on osb->blocked_lock_list for the downconvert thread.
 * Caller must hold lockres->l_lock; dc_task_lock nests inside it here.
 * Note: this only queues — the caller is responsible for waking the
 * downconvert thread.
 */
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
		/* Do not schedule a lock for downconvert when it's on
		 * the way to destruction - any nodes wanting access
		 * to the resource will get it soon. */
		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
		     lockres->l_name, lockres->l_flags);
		return;
	}

	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);

	spin_lock(&osb->dc_task_lock);
	/* the list_empty() check keeps a lockres from being queued twice */
	if (list_empty(&lockres->l_blocked_list)) {
		list_add_tail(&lockres->l_blocked_list,
			      &osb->blocked_lock_list);
		osb->blocked_lock_count++;
	}
	spin_unlock(&osb->dc_task_lock);

	mlog_exit_void();
}
34d024f8
MF
4028
/*
 * Drain the blocked-lock queue as it stood at entry.  dc_task_lock is
 * dropped around each ocfs2_process_blocked_lock() call, so new entries
 * may be queued while we run; recording dc_wake_sequence up front lets
 * ocfs2_downconvert_thread_should_wake() detect that and loop again.
 */
static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
{
	unsigned long processed;
	struct ocfs2_lock_res *lockres;

	mlog_entry_void();

	spin_lock(&osb->dc_task_lock);
	/* grab this early so we know to try again if a state change and
	 * wake happens part-way through our work */
	osb->dc_work_sequence = osb->dc_wake_sequence;

	/* bound the loop to the count seen now, not entries added later */
	processed = osb->blocked_lock_count;
	while (processed) {
		BUG_ON(list_empty(&osb->blocked_lock_list));

		lockres = list_entry(osb->blocked_lock_list.next,
				     struct ocfs2_lock_res, l_blocked_list);
		list_del_init(&lockres->l_blocked_list);
		osb->blocked_lock_count--;
		spin_unlock(&osb->dc_task_lock);

		BUG_ON(!processed);
		processed--;

		/* called without dc_task_lock held */
		ocfs2_process_blocked_lock(osb, lockres);

		spin_lock(&osb->dc_task_lock);
	}
	spin_unlock(&osb->dc_task_lock);

	mlog_exit_void();
}
4062
4063static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
4064{
4065 int empty = 0;
4066
4067 spin_lock(&osb->dc_task_lock);
4068 if (list_empty(&osb->blocked_lock_list))
4069 empty = 1;
4070
4071 spin_unlock(&osb->dc_task_lock);
4072 return empty;
4073}
4074
4075static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb)
4076{
4077 int should_wake = 0;
4078
4079 spin_lock(&osb->dc_task_lock);
4080 if (osb->dc_work_sequence != osb->dc_wake_sequence)
4081 should_wake = 1;
4082 spin_unlock(&osb->dc_task_lock);
4083
4084 return should_wake;
4085}
4086
/*
 * Main loop of the per-mount downconvert kthread: sleep until woken (or
 * asked to stop), then service the blocked lock list.  Exits only once
 * kthread_should_stop() is set AND the queue has been fully drained, so
 * no queued lockres is abandoned at shutdown.
 */
static int ocfs2_downconvert_thread(void *arg)
{
	int status = 0;
	struct ocfs2_super *osb = arg;

	/* only quit once we've been asked to stop and there is no more
	 * work available */
	while (!(kthread_should_stop() &&
		ocfs2_downconvert_thread_lists_empty(osb))) {

		wait_event_interruptible(osb->dc_event,
					 ocfs2_downconvert_thread_should_wake(osb) ||
					 kthread_should_stop());

		mlog(0, "downconvert_thread: awoken\n");

		ocfs2_downconvert_thread_do_work(osb);
	}

	/* announce our exit before returning to kthread machinery */
	osb->dc_task = NULL;
	return status;
}
4109
/*
 * Bump the wake sequence and wake the downconvert thread so it re-scans
 * the blocked lock list.
 */
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb)
{
	spin_lock(&osb->dc_task_lock);
	/* make sure the downconvert thread gets a swipe at whatever changes
	 * the caller may have made to the blocked lock state ("voting
	 * thread" was this thread's name before the downconvert rename) */
	osb->dc_wake_sequence++;
	spin_unlock(&osb->dc_task_lock);
	wake_up(&osb->dc_event);
}