From b6723c8da55af5309cf06e71a5228f3c02846c5a Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Thu, 10 Mar 2016 12:14:44 +0800 Subject: [PATCH] drm/amdgpu: use ref to keep job alive this is to fix fatal page fault error that occured if: job is signaled/released after its timeout work is already put to the global queue (in this case the cancel_delayed_work will return false), which will lead to NX-protection error page fault during job_timeout_func. Signed-off-by: Monk Liu Reviewed-by: Chunming Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 15 ++++++++++++--- drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 8 +++++++- drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 13 +++++++++++++ 5 files changed, 35 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index ccb28468ece8..2ac07ebecfd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -750,7 +750,9 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, struct amdgpu_job **job); int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size, struct amdgpu_job **job); + void amdgpu_job_free(struct amdgpu_job *job); +void amdgpu_job_free_func(struct kref *refcount); int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring, struct amd_sched_entity *entity, void *owner, struct fence **f); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 9025671d21c3..d7e0b0b9a1bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -872,6 +872,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, r = amd_sched_job_init(&job->base, &ring->sched, &p->ctx->rings[ring->idx].entity, amdgpu_job_timeout_func, + amdgpu_job_free_func, p->filp, &fence); if (r) { amdgpu_job_free(job); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 961cae4a1955..a052ac2b131d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -31,7 +31,7 @@ static void amdgpu_job_free_handler(struct work_struct *ws) { struct amdgpu_job *job = container_of(ws, struct amdgpu_job, base.work_free_job); - kfree(job); + amd_sched_job_put(&job->base); } void amdgpu_job_timeout_func(struct work_struct *work) @@ -41,6 +41,8 @@ void amdgpu_job_timeout_func(struct work_struct *work) job->base.sched->name, (uint32_t)atomic_read(&job->ring->fence_drv.last_seq), job->ring->fence_drv.sync_seq); + + amd_sched_job_put(&job->base); } int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, @@ -101,6 +103,12 @@ void amdgpu_job_free(struct amdgpu_job *job) kfree(job); } +void amdgpu_job_free_func(struct kref *refcount) +{ + struct amdgpu_job *job = container_of(refcount, struct amdgpu_job, base.refcount); + kfree(job); +} + int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring, struct amd_sched_entity *entity, void *owner, struct fence **f) @@ -113,9 +121,10 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring, return -EINVAL; r = amd_sched_job_init(&job->base, &ring->sched, - entity, owner, + entity, amdgpu_job_timeout_func, - &fence); + amdgpu_job_free_func, + owner, &fence); if (r) return r; diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c index b7e8071448c6..639c70de217c 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c @@ -333,7 +333,8 @@ void amd_sched_job_finish(struct amd_sched_job *s_job) struct amd_gpu_scheduler *sched = s_job->sched; if (sched->timeout != MAX_SCHEDULE_TIMEOUT) { - cancel_delayed_work(&s_job->work_tdr); /*TODO: how to deal the case that tdr is running */ + if (cancel_delayed_work(&s_job->work_tdr)) + amd_sched_job_put(s_job); /* queue TDR for next job */ next = list_first_entry_or_null(&sched->ring_mirror_list, @@ -341,6 +342,7 @@ void amd_sched_job_finish(struct amd_sched_job *s_job) if (next) { INIT_DELAYED_WORK(&next->work_tdr, s_job->timeout_callback); + amd_sched_job_get(next); schedule_delayed_work(&next->work_tdr, sched->timeout); } } @@ -354,6 +356,7 @@ void amd_sched_job_begin(struct amd_sched_job *s_job) list_first_entry_or_null(&sched->ring_mirror_list, struct amd_sched_job, node) == s_job) { INIT_DELAYED_WORK(&s_job->work_tdr, s_job->timeout_callback); + amd_sched_job_get(s_job); schedule_delayed_work(&s_job->work_tdr, sched->timeout); } } @@ -382,9 +385,11 @@ int amd_sched_job_init(struct amd_sched_job *job, struct amd_gpu_scheduler *sched, struct amd_sched_entity *entity, void (*timeout_cb)(struct work_struct *work), + void (*free_cb)(struct kref *refcount), void *owner, struct fence **fence) { INIT_LIST_HEAD(&job->node); + kref_init(&job->refcount); job->sched = sched; job->s_entity = entity; job->s_fence = amd_sched_fence_create(entity, owner); @@ -393,6 +398,7 @@ int amd_sched_job_init(struct amd_sched_job *job, job->s_fence->s_job = job; job->timeout_callback = timeout_cb; + job->free_callback = free_cb; if (fence) *fence = &job->s_fence->base; diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h index a5700aded5bf..95ebfd069690 100644 --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h @@ -78,6 +78,7 @@ struct amd_sched_fence { }; struct amd_sched_job { + struct kref refcount; struct amd_gpu_scheduler *sched; struct amd_sched_entity *s_entity; struct amd_sched_fence *s_fence; @@ -87,6 +88,7 @@ struct amd_sched_job { struct list_head node; struct delayed_work work_tdr; void (*timeout_callback) (struct work_struct *work); + void (*free_callback)(struct kref *refcount); }; extern const struct fence_ops amd_sched_fence_ops; @@ -155,9 +157,20 @@ int amd_sched_job_init(struct amd_sched_job *job, struct amd_gpu_scheduler *sched, struct amd_sched_entity *entity, void (*timeout_cb)(struct work_struct *work), + void (*free_cb)(struct kref* refcount), void *owner, struct fence **fence); void amd_sched_job_pre_schedule(struct amd_gpu_scheduler *sched , struct amd_sched_job *s_job); void amd_sched_job_finish(struct amd_sched_job *s_job); void amd_sched_job_begin(struct amd_sched_job *s_job); +static inline void amd_sched_job_get(struct amd_sched_job *job) { + if (job) + kref_get(&job->refcount); +} + +static inline void amd_sched_job_put(struct amd_sched_job *job) { + if (job) + kref_put(&job->refcount, job->free_callback); +} + #endif -- 2.20.1