/* WARNING: THIS FILE IS AUTOMATICALLY GENERATED FROM A .SM FILE.
 * Changes made here will most likely be overwritten.
 */

/* 
 * (C) 2003 Clemson University and The University of Chicago 
 *
 * See COPYING in top-level directory.
 */

/** \file
 *  \ingroup sysint
 *
 *  PVFS2 system interface routines for reading and writing files.
 */

#include <string.h>
#include <assert.h>

#include "client-state-machine.h"
#include "pvfs2-types-debug.h"
#include "pvfs2-debug.h"
#include "job.h"
#include "gossip.h"
#include "str-utils.h"
#include "pint-servreq.h"
#include "pint-cached-config.h"
#include "PINT-reqproto-encode.h"
#include "pint-util.h"

extern job_context_id pint_client_sm_context;

/*
 * Result codes specific to the I/O state machine.  State actions store
 * them in js_p->error_code and the ST_* transition tables in this file
 * match on them to pick the next state.
 */
enum
{
    IO_NO_DATA                     = 132,
    IO_DATAFILE_TRANSFERS_COMPLETE = 133,
    IO_RETRY                       = 134,
    IO_GET_DATAFILE_SIZE           = 135,
    IO_ANALYZE_SIZE_RESULTS        = 136
};

/*
 * State action functions.  Each of these is referenced by name from the
 * ST_* transition tables in this file; all share the same
 * (sm_p, js_p) signature.
 */
static int io_init(
    PINT_client_sm *sm_p, job_status_s *js_p);
static int io_datafile_setup_msgpairs(
    PINT_client_sm *sm_p, job_status_s *js_p);
static int io_datafile_post_msgpairs(
    PINT_client_sm *sm_p, job_status_s *js_p);
static int io_datafile_post_msgpairs_retry(
    PINT_client_sm *sm_p, job_status_s *js_p);
static int io_datafile_complete_operations(
    PINT_client_sm *sm_p, job_status_s *js_p);
static int io_analyze_results(
    PINT_client_sm *sm_p, job_status_s *js_p);
static int io_analyze_size_results(
    PINT_client_sm *sm_p, job_status_s *js_p);
static int io_cleanup(
    PINT_client_sm *sm_p, job_status_s *js_p);

/*
 * misc helper functions
 *
 * These operate on a single per-datafile context (PINT_client_io_ctx)
 * or on the state machine as a whole; they are file-local (static).
 */
static inline int complete_context_send_or_recv(
    PINT_client_sm *sm_p, job_status_s *js_p);
static inline int process_context_recv(
    PINT_client_io_ctx *cur_ctx,
    struct PINT_decoded_msg *decoded_resp,
    struct PVFS_server_resp **resp);
static inline int build_context_flow(
    PINT_client_sm *sm_p, PINT_client_io_ctx *cur_ctx,
    PVFS_object_attr *attr, struct PVFS_server_resp *resp);
static inline int process_context_recv_and_post_flow(
    PINT_client_sm *sm_p, job_status_s *js_p, PINT_client_io_ctx **out_ctx);
static inline int check_context_status(
    PINT_client_io_ctx *cur_ctx, int io_type,
    PVFS_size *total_size);
/*
 * Called by io_datafile_setup_msgpairs() with the metafile's datafile
 * handle array; fills handle_index_array and *handle_index_out_count
 * with the datafiles that the request touches.
 */
static int io_find_target_datafiles(
    PVFS_Request mem_req, PVFS_Request file_req,
    PVFS_offset file_req_offset, PINT_dist *dist_p,
    PVFS_handle *input_handle_array, int input_handle_count,
    int *handle_index_array, int *handle_index_out_count);

/* misc constants and helper macros */

/*
 * value complete_context_send_or_recv() is expected to report when the
 * completed operation was a msgpair recv (the caller asserts on it)
 */
#define IO_RECV_COMPLETED                                    1

/*
 * possible I/O state machine phases (status_user_tag)
 *
 * A status_user_tag encodes both the context index and the phase:
 *     tag = (IO_SM_NUM_PHASES * context_index) + phase
 */
#define IO_SM_PHASE_REQ_MSGPAIR_RECV                         0
#define IO_SM_PHASE_REQ_MSGPAIR_SEND                         1
#define IO_SM_PHASE_FLOW                                     2
#define IO_SM_PHASE_FINAL_ACK                                3
#define IO_SM_NUM_PHASES                                     4

/*
 * Every macro argument is parenthesized so that expression arguments
 * such as (base + phase) are decoded as a whole.
 */

/* non-zero when the phase encoded in tag equals type */
#define STATUS_USER_TAG_TYPE(tag, type)                      \
(((tag) % IO_SM_NUM_PHASES) == (type))
/* context index encoded in tag (type is accepted but not used) */
#define STATUS_USER_TAG_GET_INDEX(tag, type)                 \
((tag) / IO_SM_NUM_PHASES)
/* non-zero when tag belongs to the request msgpair send or recv phase */
#define STATUS_USER_TAG_IS_SEND_OR_RECV(tag)                 \
(STATUS_USER_TAG_TYPE(tag, IO_SM_PHASE_REQ_MSGPAIR_RECV) ||  \
 STATUS_USER_TAG_TYPE(tag, IO_SM_PHASE_REQ_MSGPAIR_SEND))

/*
 * Release the per-operation I/O bookkeeping attached to sm_p: the
 * datafile index array, every context's flow descriptor, the msgarray
 * (unless it aliases the embedded sm_p->msgpair) and the context array.
 * Freed pointers are reset to NULL so the macro may be invoked again.
 *
 * The flow descriptors are only visited while the context array exists:
 * msgarray_count is reset solely when a heap msgarray is freed, so it
 * can still be non-zero after the contexts are already gone.
 */
#define CLEAN_PRIVATE_MEMBERS(sm_p)                          \
do {                                                         \
    int clean_ctx_idx;                                       \
    if ((sm_p)->u.io.datafile_index_array)                   \
    {                                                        \
        free((sm_p)->u.io.datafile_index_array);             \
        (sm_p)->u.io.datafile_index_array = NULL;            \
    }                                                        \
    if ((sm_p)->u.io.contexts)                               \
    {                                                        \
        for (clean_ctx_idx = 0;                              \
             clean_ctx_idx < (sm_p)->msgarray_count;         \
             clean_ctx_idx++)                                \
        {                                                    \
            PINT_flow_clear(                                 \
                &((sm_p)->u.io.contexts[clean_ctx_idx].flow_desc)); \
        }                                                    \
    }                                                        \
    if ((sm_p)->msgarray &&                                  \
        ((sm_p)->msgarray != &(sm_p)->msgpair))              \
    {                                                        \
        free((sm_p)->msgarray);                              \
        (sm_p)->msgarray = NULL;                             \
        (sm_p)->msgarray_count = 0;                          \
    }                                                        \
    if ((sm_p)->u.io.contexts)                               \
    {                                                        \
        free((sm_p)->u.io.contexts);                         \
        (sm_p)->u.io.contexts = NULL;                        \
    }                                                        \
} while(0)

/*
 * Generated transition tables for the I/O state machine.
 *
 * NOTE(review): each table appears to be laid out as
 *     [ kind flag, action function or nested machine,
 *       (result code, next table) pairs ... ]
 * with -1 acting as the catch-all result code.  The exact meaning of
 * the leading 0 / 6 flags and of the trailing 7 in ST_io_cleanup is
 * defined by the state machine generator -- confirm there before
 * relying on the per-table summaries below.
 */

/* forward declarations so the tables can refer to each other */
static union PINT_state_array_values ST_init[];
static union PINT_state_array_values ST_io_getattr[];
static union PINT_state_array_values ST_io_datafile_setup_msgpairs[];
static union PINT_state_array_values ST_io_datafile_post_msgpairs[];
static union PINT_state_array_values ST_io_datafile_post_msgpairs_retry[];
static union PINT_state_array_values ST_io_datafile_complete_operations[];
static union PINT_state_array_values ST_io_analyze_results[];
static union PINT_state_array_values ST_io_datafile_size[];
static union PINT_state_array_values ST_io_analyze_size_results[];
static union PINT_state_array_values ST_io_cleanup[];

/* the machine itself: first table plus a printable name */
struct PINT_state_machine_s pvfs2_client_io_sm =
{
	ST_init,
	"pvfs2_client_io_sm"
};
/* init: io_init; any result -> io_getattr */
static union PINT_state_array_values ST_init[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_init,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_getattr
};

/* io_getattr: pvfs2_client_getattr_sm; 0 -> setup_msgpairs, else cleanup */
static union PINT_state_array_values ST_io_getattr[] = {
(union PINT_state_array_values) 6,
(union PINT_state_array_values) &pvfs2_client_getattr_sm,
(union PINT_state_array_values) 0,
(union PINT_state_array_values) ST_io_datafile_setup_msgpairs,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_cleanup
};

/*
 * setup_msgpairs: IO_NO_DATA -> cleanup, 0 -> post_msgpairs,
 * anything else -> cleanup
 */
static union PINT_state_array_values ST_io_datafile_setup_msgpairs[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_datafile_setup_msgpairs,
(union PINT_state_array_values) IO_NO_DATA,
(union PINT_state_array_values) ST_io_cleanup,
(union PINT_state_array_values) 0,
(union PINT_state_array_values) ST_io_datafile_post_msgpairs,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_cleanup
};

/* post_msgpairs: IO_RETRY -> post_msgpairs_retry, else complete_operations */
static union PINT_state_array_values ST_io_datafile_post_msgpairs[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_datafile_post_msgpairs,
(union PINT_state_array_values) IO_RETRY,
(union PINT_state_array_values) ST_io_datafile_post_msgpairs_retry,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_datafile_complete_operations
};

/*
 * post_msgpairs_retry: IO_DATAFILE_TRANSFERS_COMPLETE -> analyze_results,
 * else back to post_msgpairs
 */
static union PINT_state_array_values ST_io_datafile_post_msgpairs_retry[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_datafile_post_msgpairs_retry,
(union PINT_state_array_values) IO_DATAFILE_TRANSFERS_COMPLETE,
(union PINT_state_array_values) ST_io_analyze_results,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_datafile_post_msgpairs
};

/*
 * complete_operations: IO_DATAFILE_TRANSFERS_COMPLETE -> analyze_results,
 * IO_RETRY -> post_msgpairs_retry, else stay in complete_operations
 */
static union PINT_state_array_values ST_io_datafile_complete_operations[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_datafile_complete_operations,
(union PINT_state_array_values) IO_DATAFILE_TRANSFERS_COMPLETE,
(union PINT_state_array_values) ST_io_analyze_results,
(union PINT_state_array_values) IO_RETRY,
(union PINT_state_array_values) ST_io_datafile_post_msgpairs_retry,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_datafile_complete_operations
};

/*
 * analyze_results: IO_RETRY -> init (whole operation restarts),
 * IO_ANALYZE_SIZE_RESULTS -> analyze_size_results,
 * IO_GET_DATAFILE_SIZE -> datafile_size, else cleanup
 */
static union PINT_state_array_values ST_io_analyze_results[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_analyze_results,
(union PINT_state_array_values) IO_RETRY,
(union PINT_state_array_values) ST_init,
(union PINT_state_array_values) IO_ANALYZE_SIZE_RESULTS,
(union PINT_state_array_values) ST_io_analyze_size_results,
(union PINT_state_array_values) IO_GET_DATAFILE_SIZE,
(union PINT_state_array_values) ST_io_datafile_size,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_cleanup
};

/* datafile_size: pvfs2_client_getattr_sm; 0 -> analyze_size_results, else cleanup */
static union PINT_state_array_values ST_io_datafile_size[] = {
(union PINT_state_array_values) 6,
(union PINT_state_array_values) &pvfs2_client_getattr_sm,
(union PINT_state_array_values) 0,
(union PINT_state_array_values) ST_io_analyze_size_results,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_cleanup
};

/* analyze_size_results: io_analyze_size_results; any result -> cleanup */
static union PINT_state_array_values ST_io_analyze_size_results[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_analyze_size_results,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) ST_io_cleanup
};

/* cleanup: io_cleanup; the trailing 7 presumably ends the machine -- confirm */
static union PINT_state_array_values ST_io_cleanup[] = {
(union PINT_state_array_values) 0,
(union PINT_state_array_values) io_cleanup,
(union PINT_state_array_values) -1,
(union PINT_state_array_values) 7
};

# 205 "src/client/sysint/sys-io.sm"


/*
 * Shared front end of PVFS_isys_io() and PVFS_sys_io(): validates the
 * request and, when there is data to move, allocates the client state
 * machine, fills in the I/O parameters and hands it to
 * PINT_client_state_machine_post().
 *
 * \param posted set to 1 only when PINT_client_state_machine_post() was
 *        actually called.  It stays 0 when the request was rejected,
 *        when allocation failed, or when a zero byte request was
 *        completed on the spot; in those cases this function has not
 *        written *op_id.
 *
 * \return 0 for a zero byte request, -PVFS_EINVAL / -PVFS_ENOMEM for
 *         rejected requests, otherwise whatever the post call returned.
 *
 * NOTE(review): cur_fs is still read after PINT_put_server_config_struct()
 * has been called; confirm that the filesystem entry stays valid once
 * the configuration has been put back.
 */
static PVFS_error io_validate_and_post(
    PVFS_object_ref ref,
    PVFS_Request file_req,
    PVFS_offset file_req_offset,
    void *buffer,
    PVFS_Request mem_req,
    PVFS_credentials *credentials,
    PVFS_sysresp_io *resp_p,
    enum PVFS_io_type io_type,
    PVFS_sys_op_id *op_id,
    void *user_ptr,
    int *posted)
{
    PVFS_error ret = -PVFS_EINVAL;
    PINT_client_sm *sm_p = NULL;
    struct filesystem_configuration_s* cur_fs = NULL;
    struct server_configuration_s *server_config = NULL;

    *posted = 0;

    gossip_debug(GOSSIP_CLIENT_DEBUG, "PVFS_isys_io entered [%Lu]\n",
                 Lu(ref.handle));

    if ((ref.handle == PVFS_HANDLE_NULL) ||
        (ref.fs_id == PVFS_FS_ID_NULL) || (resp_p == NULL))
    {
        gossip_err("invalid (NULL) required argument\n");
        return ret;
    }

    if ((io_type != PVFS_IO_READ) && (io_type != PVFS_IO_WRITE))
    {
        gossip_err("invalid (unknown) I/O type specified\n");
        return ret;
    }

    server_config = PINT_get_server_config_struct(ref.fs_id);
    cur_fs = PINT_config_find_fs_id(server_config, ref.fs_id);
    PINT_put_server_config_struct(server_config);

    if (!cur_fs)
    {
        gossip_err("invalid (unknown) fs id specified\n");
        return ret;
    }

    /* look for zero byte operations */
    if ((PINT_REQUEST_TOTAL_BYTES(mem_req) == 0) ||
        (PINT_REQUEST_TOTAL_BYTES(file_req) == 0))
    {
        gossip_ldebug(GOSSIP_IO_DEBUG, "Warning: 0 byte I/O operation "
                      "attempted.\n");
        /* nothing to transfer: complete right here, nothing is posted */
        resp_p->total_completed = 0;
        return 0;
    }

    sm_p = (PINT_client_sm *)malloc(sizeof(*sm_p));
    if (sm_p == NULL)
    {
        return -PVFS_ENOMEM;
    }
    memset(sm_p, 0, sizeof(*sm_p));

    PINT_init_msgarray_params(&sm_p->msgarray_params, ref.fs_id);
    PINT_init_sysint_credentials(sm_p->cred_p, credentials);
    sm_p->u.io.io_type = io_type;
    sm_p->u.io.file_req = file_req;
    sm_p->u.io.file_req_offset = file_req_offset;
    sm_p->u.io.io_resp_p = resp_p;
    sm_p->u.io.mem_req = mem_req;
    sm_p->u.io.buffer = buffer;
    sm_p->u.io.flowproto_type = cur_fs->flowproto;
    sm_p->u.io.encoding = cur_fs->encoding;
    sm_p->u.io.stored_error_code = 0;
    sm_p->u.io.retry_count = 0;
    sm_p->msgarray = NULL;
    sm_p->u.io.datafile_index_array = NULL;
    sm_p->u.io.datafile_count = 0;
    sm_p->u.io.total_size = 0;
    sm_p->u.io.continue_analysis = 0;
    sm_p->u.io.saved_ret = 0;
    sm_p->u.io.saved_error_code = 0;
    sm_p->object_ref = ref;

    /* from here on the operation is owned by the state machine code */
    *posted = 1;
    return PINT_client_state_machine_post(
        sm_p, PVFS_SYS_IO, op_id, user_ptr);
}

/** Initiate a read or write operation.
 *
 *  \param io_type specifies if the operation is a read or write.
 *
 *  \return 0 on success or a negative PVFS error code.  A zero byte
 *  request (either memory or file request) also returns 0 after setting
 *  resp_p->total_completed to 0; no operation is posted in that case
 *  and *op_id is left untouched.
 */
PVFS_error PVFS_isys_io(
    PVFS_object_ref ref,
    PVFS_Request file_req,
    PVFS_offset file_req_offset,
    void *buffer,
    PVFS_Request mem_req,
    PVFS_credentials *credentials,
    PVFS_sysresp_io *resp_p,
    enum PVFS_io_type io_type,
    PVFS_sys_op_id *op_id,
    void *user_ptr)
{
    int posted = 0;

    return io_validate_and_post(
        ref, file_req, file_req_offset, buffer, mem_req,
        credentials, resp_p, io_type, op_id, user_ptr, &posted);
}

/** Perform a read or write operation.
 *
 *  Blocking variant: posts the operation and waits for it to finish.
 *
 *  \param io_type specifies if the operation is a read or write.
 *
 *  \return 0 on success or the error reported by the post or the wait.
 */
PVFS_error PVFS_sys_io(
    PVFS_object_ref ref,
    PVFS_Request file_req,
    PVFS_offset file_req_offset,
    void *buffer,
    PVFS_Request mem_req,
    PVFS_credentials *credentials,
    PVFS_sysresp_io *resp_p,
    enum PVFS_io_type io_type)
{
    PVFS_error ret = -PVFS_EINVAL, error = 0;
    PVFS_sys_op_id op_id;
    int posted = 0;

    gossip_debug(GOSSIP_CLIENT_DEBUG, "PVFS_sys_io entered\n");

    /* make sure the id is never read while indeterminate */
    memset(&op_id, 0, sizeof(op_id));

    ret = io_validate_and_post(
        ref, file_req, file_req_offset, buffer, mem_req,
        credentials, resp_p, io_type, &op_id, NULL, &posted);

    if (!posted)
    {
        /*
         * Rejected, or completed immediately (zero byte request):
         * there is no operation id to wait on or to release.
         */
        if (ret)
        {
            PVFS_perror_gossip("PVFS_isys_io call", ret);
        }
        return ret;
    }

    if (ret)
    {
        PVFS_perror_gossip("PVFS_isys_io call", ret);
        error = ret;
    }
    else
    {
        ret = PINT_sys_wait(op_id, "io", &error);
        if (ret)
        {
            PVFS_perror_gossip("PVFS_sys_wait call", ret);
            error = ret;
        }
    }

    PINT_sys_release(op_id);
    return error;
}

/*******************************************************************/

/*
 * First state of the I/O machine; also re-entered with IO_RETRY when
 * the whole operation is restarted.
 *
 * Always prepares sm_p->getattr for a metafile attribute fetch.  On a
 * plain first entry that is all it does.  On a retry it additionally
 * clears the error code, drops the private state of the previous
 * attempt and then either reports -PVFS_ECANCEL (operation was
 * cancelled) or posts a timer for msgarray_params.retry_delay.
 *
 * Returns 1 when js_p->error_code is ready to be examined, otherwise
 * the return value of job_req_sched_post_timer().
 */
static int io_init(PINT_client_sm *sm_p,
                   job_status_s *js_p)
{
    job_id_t timer_job_id;

    gossip_debug(GOSSIP_CLIENT_DEBUG, "(%p) io state: io_init\n", sm_p);

    /* only a clean start or an explicit retry may land here */
    assert((js_p->error_code == 0) ||
           (js_p->error_code == IO_RETRY));

    PINT_SM_GETATTR_STATE_FILL(
        sm_p->getattr,
        sm_p->object_ref,
        PVFS_ATTR_META_ALL|PVFS_ATTR_COMMON_TYPE,
        PVFS_TYPE_METAFILE);

    if (js_p->error_code != IO_RETRY)
    {
        return 1;
    }

    /* retry: start over from a clean slate */
    js_p->error_code = 0;
    CLEAN_PRIVATE_MEMBERS(sm_p);

    if (!sm_p->op_cancelled)
    {
        /* back off for the configured delay before trying again */
        return job_req_sched_post_timer(
            sm_p->msgarray_params.retry_delay, sm_p, 0, js_p,
            &timer_job_id, pint_client_sm_context);
    }

    js_p->error_code = -PVFS_ECANCEL;
    return 1;
}

static int io_datafile_setup_msgpairs(PINT_client_sm *sm_p,
                                      job_status_s *js_p)
{
    int ret = -PVFS_EINVAL, i = 0;
    PVFS_object_attr *attr = NULL;
    int target_datafile_count = 0;

    gossip_debug(GOSSIP_CLIENT_DEBUG, "(%p) io state: "
                 "io_datafile_setup_msgpairs\n", sm_p);

    if (sm_p->op_cancelled)
    {
        js_p->error_code = -PVFS_ECANCEL;
        return 1;
    }

    js_p->error_code = 0;

    attr = &sm_p->getattr.attr;
    assert(attr);

    switch(attr->objtype)
    {
        case PVFS_TYPE_METAFILE:
            assert(attr->mask & PVFS_ATTR_META_DFILES);
            assert(attr->mask & PVFS_ATTR_META_DIST);
            assert(attr->u.meta.dist_size > 0);
            assert(attr->u.meta.dfile_array);
            assert(attr->u.meta.dfile_count > 0);
            break;
        case PVFS_TYPE_DIRECTORY:
            js_p->error_code = -PVFS_EISDIR;
            return 1;
        default:
            js_p->error_code = -PVFS_EBADF;
            return 1;
    }

    ret = PINT_dist_lookup(attr->u.meta.dist);
    if (ret)
    {
        PVFS_perror_gossip("PINT_dist_lookup failed; aborting I/O", ret);
        js_p->error_code = -PVFS_EBADF;
        return 1;
    }

    /* if we're doing a read, we zero the entire output buffer according
     * to the memory request.  This way we don't have to zero out the
     * holes one at a time.  We wait to perform zeroing until we have the
     * distribution info.
     */
    if(sm_p->u.io.io_type == PVFS_IO_READ)
    {
        PINT_Request_state * req_state;
        PINT_request_file_data rfdata;
        PINT_Request_result result;
        PVFS_offset offset;
        PVFS_size size;

        req_state = PINT_new_request_state(sm_p->u.io.mem_req);
        rfdata.server_ct = 1;
        rfdata.fsize = 0;
        rfdata.dist = attr->u.meta.dist;
        rfdata.extend_flag = 0;

        result.offset_array = &offset;
        result.size_array = &size;
        result.segmax = 1;
        result.bytemax = PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req);

        do
        {
            result.segs = 0;
            result.bytes = 0;

            ret = PINT_process_request(req_state, NULL, 
                                       &rfdata, &result, PINT_MEMREQ);
            if(ret < 0)
            {
                PVFS_perror_gossip("PINT_process_request failed;"
                                   "aborting I/O", ret);
                js_p->error_code = -PVFS_EBADF;
                return 1;
            }

            if(result.segs > 0)
            {
                memset(((char *)sm_p->u.io.buffer) + offset, 0, size);
            }
        } while(req_state->lvl > -1 && result.segs > 0);

        PINT_free_request_state(req_state);
    }

    sm_p->u.io.datafile_index_array = (int *)malloc(
        (attr->u.meta.dfile_count * sizeof(int)));
    if (!sm_p->u.io.datafile_index_array)
    {
        goto malloc_error_exit;
    }
    memset(sm_p->u.io.datafile_index_array, 0,
           (attr->u.meta.dfile_count * sizeof(int)));

    ret = io_find_target_datafiles(
        sm_p->u.io.mem_req,
        sm_p->u.io.file_req,
        sm_p->u.io.file_req_offset,
        attr->u.meta.dist,
        attr->u.meta.dfile_array,
        attr->u.meta.dfile_count,
        sm_p->u.io.datafile_index_array,
        &target_datafile_count);

    assert(ret == 0);

    if (target_datafile_count == 0)
    {
        free(sm_p->u.io.datafile_index_array);
        sm_p->u.io.datafile_index_array = NULL;

        gossip_debug(GOSSIP_IO_DEBUG, "  datafile_setup_msgpairs: no "
                     "datafiles have data; aborting\n");

        js_p->error_code = IO_NO_DATA;
        return 1;
    }

    gossip_debug(GOSSIP_IO_DEBUG,
                 "  %s: %d datafiles "
                 "might have data\n", __func__, target_datafile_count);

    sm_p->u.io.contexts = (PINT_client_io_ctx *)malloc(
        (target_datafile_count * sizeof(PINT_client_io_ctx)));
    if (!sm_p->u.io.contexts)
    {
        goto malloc_error_exit;
    }
    memset(sm_p->u.io.contexts, 0,
           (target_datafile_count * sizeof(PINT_client_io_ctx)));

    sm_p->msgarray_count = target_datafile_count;
    sm_p->msgarray = (PINT_sm_msgpair_state *)malloc(
        (sm_p->msgarray_count * sizeof(PINT_sm_msgpair_state)));
    if (!sm_p->msgarray)
    {
        goto malloc_error_exit;
    }
    memset(sm_p->msgarray, 0, (sm_p->msgarray_count *
                               sizeof(PINT_sm_msgpair_state)));

    sm_p->u.io.total_cancellations_remaining = 0;

    /* initialize all per server I/O operation contexts and requests */
    for(i = 0; i < target_datafile_count; i++)
    {
        PINT_client_io_ctx *cur_ctx = &sm_p->u.io.contexts[i];
        PINT_sm_msgpair_state *msg = &sm_p->msgarray[i];

        assert(msg && cur_ctx);
        
        memset(cur_ctx, 0, sizeof(PINT_client_io_ctx));
        memset(msg, 0, sizeof(PINT_sm_msgpair_state));

        gossip_debug(GOSSIP_IO_DEBUG, "initializing context[%d] %p\n",
                     i, cur_ctx);

        cur_ctx->msg = msg;
        cur_ctx->index = i;
        cur_ctx->server_nr = sm_p->u.io.datafile_index_array[i];
        cur_ctx->data_handle =
            attr->u.meta.dfile_array[cur_ctx->server_nr];

        PINT_flow_reset(&cur_ctx->flow_desc);

        gossip_debug(GOSSIP_IO_DEBUG, "  filling I/O request "
                     "for %Lu\n", Lu(cur_ctx->data_handle));

        PINT_SERVREQ_IO_FILL(
            msg->req,
            *sm_p->cred_p,
            sm_p->object_ref.fs_id,
            cur_ctx->data_handle,
            sm_p->u.io.io_type,
            sm_p->u.io.flowproto_type,
            sm_p->u.io.datafile_index_array[i],
            attr->u.meta.dfile_count,
            attr->u.meta.dist,
            sm_p->u.io.file_req,
            sm_p->u.io.file_req_offset,
            PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req));

        msg->fs_id = sm_p->object_ref.fs_id;
        msg->handle = cur_ctx->data_handle;
        msg->retry_flag = PVFS_MSGPAIR_NO_RETRY;
        msg->comp_fn = NULL;

        ret = PINT_cached_config_map_to_server(
               &msg->svr_addr, msg->handle, msg->fs_id);
        if (ret)
        {
            gossip_err("Failed to map meta server address\n");
            js_p->error_code = ret;
            return 1;
        }
    }

    sm_p->u.io.datafile_count = target_datafile_count;

    js_p->error_code = 0;
    return 1;

  malloc_error_exit:
    CLEAN_PRIVATE_MEMBERS(sm_p);

    js_p->error_code = -PVFS_ENOMEM;
    return 1;
}

/*
  This is based on msgpairarray_post() in msgpairarray.c.  It's
  different enough in that we don't have to wait on the msgpairarray
  operations to all complete before posting flows as we can do so for each
  server individually when we're ready.  this avoids the msgpairarray
  sync point implicit in the design

  For every context this posts the response recv first and then the
  request send, skipping whatever was already posted on an earlier pass
  (retry case).  msgpair_completion_count ends up holding the number of
  sends/recvs that are still in flight.

  Returns 0 when something is outstanding (the machine is resumed by a
  completion), or 1 with js_p->error_code set: -PVFS_ECANCEL on
  cancellation, an error code when encoding or buffer allocation
  failed, or IO_RETRY when nothing at all could be posted.

  NOTE(review): when the recv post of a context fails, the context is
  encoded and its response buffer allocated again on the next pass;
  whether the earlier encoding and buffer get released is not visible
  in this function.
*/
static int io_datafile_post_msgpairs(PINT_client_sm *sm_p,
                                     job_status_s *js_p)
{
    int ret = -PVFS_EINVAL, i = 0;
    unsigned long status_user_tag = 0;
    int must_loop_encodings = 0;
    struct server_configuration_s *server_config = NULL;

    gossip_debug(GOSSIP_CLIENT_DEBUG, "io_datafile_post_msgpairs "
                 "state: post (%d message(s))\n", sm_p->msgarray_count);

    if (sm_p->op_cancelled)
    {
        js_p->error_code = -PVFS_ECANCEL;
        return 1;
    }

    js_p->error_code = 0;

    assert(sm_p->msgarray);
    assert(sm_p->msgarray_count == sm_p->u.io.datafile_count);

    /* completion count tracks sends/recvs separately, will increment
     * as we go through the loop to maintain a count of outstanding msgpairs */
    sm_p->u.io.msgpair_completion_count = 0;

    for(i = 0; i < sm_p->u.io.datafile_count; i++)
    {
        PINT_client_io_ctx *cur_ctx = &sm_p->u.io.contexts[i];
        PINT_sm_msgpair_state *msg = &sm_p->msgarray[i];

        assert(cur_ctx && msg);
        assert(cur_ctx->msg == msg);

        /* do not do this one again in retry case */
        if (cur_ctx->msg_recv_has_been_posted) {
            if (cur_ctx->msg_recv_in_progress)
                ++sm_p->u.io.msgpair_completion_count;
            goto recv_already_posted;
        }

        if (!ENCODING_IS_VALID(sm_p->u.io.encoding))
        {
            /* the value is outside the valid range altogether */
            PRINT_ENCODING_ERROR("valid", sm_p->u.io.encoding);
            must_loop_encodings = 1;
            sm_p->u.io.encoding = (ENCODING_INVALID_MIN + 1);
        }
        else if (!ENCODING_IS_SUPPORTED(sm_p->u.io.encoding))
        {
            PRINT_ENCODING_ERROR("supported", sm_p->u.io.encoding);
            must_loop_encodings = 1;
            sm_p->u.io.encoding = ENCODING_SUPPORTED_MIN;
        }

      try_next_encoding:
        assert(ENCODING_IS_VALID(sm_p->u.io.encoding));

        ret = PINT_encode(&msg->req, PINT_ENCODE_REQ, &msg->encoded_req,
                          msg->svr_addr, sm_p->u.io.encoding);
        if (ret)
        {
            if (must_loop_encodings)
            {
                gossip_debug(GOSSIP_CLIENT_DEBUG, "Looping through "
                             "encodings [%d/%d]\n", sm_p->u.io.encoding,
                             ENCODING_INVALID_MAX);

                sm_p->u.io.encoding++;
                if (ENCODING_IS_VALID(sm_p->u.io.encoding))
                {
                    goto try_next_encoding;
                }
            }
            /*
              FIXME: make this a clean error transition by adjusting
              the completion count and/or (not) exiting
            */
            PVFS_perror_gossip("PINT_encode failed", ret);
            js_p->error_code = ret;
            return 1;
        }

        /* calculate maximum response message size and allocate it */
        msg->max_resp_sz = PINT_encode_calc_max_size(
            PINT_ENCODE_RESP, msg->req.op, sm_p->u.io.encoding);
        msg->encoded_resp_p = BMI_memalloc(
            msg->svr_addr, msg->max_resp_sz, BMI_RECV);
        if (!msg->encoded_resp_p)
        {
            /* FIXME: see above FIXME */
            js_p->error_code = -PVFS_ENOMEM;
            return 1;
        }

        /*
          recalculate the status user tag based on this the progress
          of the current context like this: status_user_tag =
          (IO_SM_NUM_PHASES * (context index) + context phase)
        */
        assert(cur_ctx->index == i);
        status_user_tag =
            ((IO_SM_NUM_PHASES * i) + IO_SM_PHASE_REQ_MSGPAIR_RECV);

        gossip_debug(GOSSIP_IO_DEBUG," posting recv with "
                     "status_user_tag=%lu (max_size %d)\n",
                     status_user_tag, msg->max_resp_sz);

        cur_ctx->session_tag = PINT_util_get_next_tag();

        cur_ctx->msg_recv_has_been_posted = 0;
        cur_ctx->msg_recv_in_progress = 0;

        server_config = PINT_get_server_config_struct(sm_p->object_ref.fs_id);
        ret = job_bmi_recv(
            msg->svr_addr, msg->encoded_resp_p, msg->max_resp_sz,
            cur_ctx->session_tag, BMI_PRE_ALLOC, sm_p, status_user_tag,
            &msg->recv_status, &msg->recv_id, pint_client_sm_context,
            server_config->client_job_bmi_timeout);
        PINT_put_server_config_struct(server_config);

        /* ret -1: problem, do not look at msg recv_status */
        /* ret 1: immediate completion, see status */
        /* ret 0: okay */

        if (ret < 0) {
            PVFS_perror_gossip("Post of receive failed", ret);
            js_p->error_code = ret;
            continue;

        }

        if (ret == 0) {
            int tmp = 0;
            /* perform a quick test to see if the recv failed before
             * posting the send; if it reports an error quickly then
             * we can save the confusion of sending a request for
             * which we can't recv a response
             */
            ret = job_test(msg->recv_id, &tmp, NULL,
                           &msg->recv_status, 0,
                           pint_client_sm_context);
            if (ret < 0) {
                PVFS_perror_gossip("Post of receive failed", ret);
                js_p->error_code = ret;
                continue;
            }
        }

        /* either from job_bmi_recv or from job_test finding something */
        if (ret == 1) {
            /*
             * This recv must have completed with an error because the
             * server has not yet been sent our request.
             */
            PVFS_perror_gossip("Receive immediately failed",
                               msg->recv_status.error_code);

            ret = msg->recv_status.error_code;
            js_p->error_code = ret;
            continue;
        }

        cur_ctx->msg_recv_has_been_posted = 1;
        cur_ctx->msg_recv_in_progress = 1;

        /* posted the receive okay */
        ++sm_p->u.io.msgpair_completion_count;

      recv_already_posted:

        if (cur_ctx->msg_send_has_been_posted) {
            if (cur_ctx->msg_send_in_progress)
                ++sm_p->u.io.msgpair_completion_count;
            continue;
        }

        status_user_tag =
            ((IO_SM_NUM_PHASES * i) + IO_SM_PHASE_REQ_MSGPAIR_SEND);

        cur_ctx->msg_send_has_been_posted = 0;
        cur_ctx->msg_send_in_progress = 0;

        gossip_debug(GOSSIP_IO_DEBUG," posting send with "
                     "status_user_tag=%lu\n", status_user_tag);

        server_config = PINT_get_server_config_struct(sm_p->object_ref.fs_id);
        ret = job_bmi_send_list(
            msg->encoded_req.dest, msg->encoded_req.buffer_list,
            msg->encoded_req.size_list, msg->encoded_req.list_count,
            msg->encoded_req.total_size, cur_ctx->session_tag,
            msg->encoded_req.buffer_type, 1, sm_p, status_user_tag,
            &msg->send_status, &msg->send_id, pint_client_sm_context,
            server_config->client_job_bmi_timeout);
        PINT_put_server_config_struct(server_config);

        if (ret < 0) {
            PVFS_perror_gossip("Post of send failed, cancelling recv", ret);
            msg->op_status = msg->send_status.error_code;
            msg->send_id = 0;
            /* the recv stays counted; its completion is still awaited */
            job_bmi_cancel(msg->recv_id, pint_client_sm_context);

            js_p->error_code = ret;
            continue;
        }

        if (ret == 1) {
            if (msg->send_status.error_code == 0) {
                gossip_debug(GOSSIP_IO_DEBUG, "  io_datafile_post_msgpairs: "
                    "send completed immediately.\n");

                /* 0 is the valid "completed job id" value */
                cur_ctx->msg_send_has_been_posted = 1;
                msg->send_id = 0;

            } else {
                /* report the status of the send, which is what failed */
                PVFS_perror_gossip("Send immediately failed, cancelling recv",
                    msg->send_status.error_code);

                msg->op_status = msg->send_status.error_code;
                msg->send_id = 0;

                /* still wait for the recv to complete */
                job_bmi_cancel(msg->recv_id, pint_client_sm_context);

                js_p->error_code = msg->send_status.error_code;
                continue;
            }
        } else {
            /* posted the send */
            cur_ctx->msg_send_in_progress = 1;
            cur_ctx->msg_send_has_been_posted = 1;
            ++sm_p->u.io.msgpair_completion_count;
        }
    }

    gossip_debug(GOSSIP_IO_DEBUG, "io_datafile_post_msgpairs: "
                 "completion count is %d\n",
                 sm_p->u.io.msgpair_completion_count);

    /* if anything posted, just wait for that to complete, else
     * go sleep then try the remaining msgpairs again */
    if (sm_p->u.io.msgpair_completion_count
     || sm_p->u.io.flow_completion_count
     || sm_p->u.io.write_ack_completion_count)
        return 0;  /* means go find another machine to run */
    else {
        js_p->error_code = IO_RETRY;
        return 1;  /* means look at error_code and run my machine again */
    }
}

/*
 * For IO retry, come here to sleep a bit then go back and post
 * some more msgpairs.
 *
 * Increments the retry counter.  Once it exceeds
 * msgarray_params.retry_limit the state gives up and reports
 * IO_DATAFILE_TRANSFERS_COMPLETE (return 1); otherwise it posts a timer
 * for msgarray_params.retry_delay and returns the result of that post.
 */
static int
io_datafile_post_msgpairs_retry(PINT_client_sm *sm_p, job_status_s *js_p)
{
    /*
     * The timer's job id is not needed afterwards; a real variable is
     * passed, as io_init() does, rather than relying on the job layer
     * accepting a NULL id pointer.
     */
    job_id_t tmp_id;

    /* give up if beyond retry limit */
    ++sm_p->u.io.retry_count;
    if (sm_p->u.io.retry_count > sm_p->msgarray_params.retry_limit) {
        gossip_debug(GOSSIP_CLIENT_DEBUG, "%s: retry %d exceeds limit %d\n",
          __func__, sm_p->u.io.retry_count, sm_p->msgarray_params.retry_limit);
        js_p->error_code = IO_DATAFILE_TRANSFERS_COMPLETE;
        return 1;
    }

    gossip_debug(GOSSIP_CLIENT_DEBUG, "%s: retry %d, wait %d ms\n", __func__,
      sm_p->u.io.retry_count, sm_p->msgarray_params.retry_delay);

    return job_req_sched_post_timer(sm_p->msgarray_params.retry_delay,
        sm_p, 0, js_p, &tmp_id, pint_client_sm_context);
}

/*
  This state allows us to make sure all posted operations complete and
  are accounted for.  Since this handles ALL operation completions,
  there's special case handling of completing the msgpair recv.  In
  this case we post the flow operations as soon as we see them (the
  main motivation for not using the common msgpairarray code).

  The kind of completion (msgpair send, msgpair recv, flow, or final
  write ack) and the context index are taken from
  js_p->status_user_tag.

  Returns 0 while any msgpair, flow or write ack is still outstanding.
  Returns 1 with js_p->error_code set to:
    - a negative error if complete_context_send_or_recv() failed,
    - IO_RETRY if some context still needs its msgpair (re)posted,
    - IO_DATAFILE_TRANSFERS_COMPLETE otherwise.
*/
static int io_datafile_complete_operations(PINT_client_sm *sm_p,
                                           job_status_s *js_p)
{
    int ret = -PVFS_EINVAL, index = 0, i;
    unsigned long status_user_tag = (unsigned long)
        js_p->status_user_tag;
    PINT_client_io_ctx *cur_ctx = NULL;
    int matched_send_or_recv = 0;
    struct server_configuration_s *server_config = NULL;

    gossip_debug(
        GOSSIP_CLIENT_DEBUG, "(%p) io_datafile_complete_operations "
        "(tag %lu)\n", sm_p, status_user_tag);

    assert(sm_p->msgarray_count == sm_p->u.io.datafile_count);
    assert(sm_p->u.io.msgpair_completion_count > -1);
    assert(sm_p->u.io.flow_completion_count > -1);
    assert(sm_p->u.io.write_ack_completion_count > -1);

    /* check if we're completing a send or recv msgpair */
    if (STATUS_USER_TAG_IS_SEND_OR_RECV(status_user_tag))
    {
        /*
         * The completion count might validly be zero when recovering from
         * a cancellation.
         */
        if (sm_p->u.io.msgpair_completion_count)
        {
            ret = complete_context_send_or_recv(sm_p, js_p);
            if (ret < 0) {
                /* problem */
                PVFS_perror_gossip(
                    "complete_context_send_or_recv failed", ret);
                js_p->error_code = ret;
                return 1;
            } else if (ret == 0) {
                /* is a send; look up its context index so the debug
                 * message reports the real context, not always 0 */
                index = STATUS_USER_TAG_GET_INDEX(
                    status_user_tag, IO_SM_PHASE_REQ_MSGPAIR_SEND);
                gossip_debug(GOSSIP_IO_DEBUG, "  matched send in context "
                             "%d; continuing.\n", index);
                js_p->error_code = 0;
                /* If send had problem, BMI will apparently ensure that the
                 * recv will fail too, so handle the retry stuff there.
                 */
                return 0;
            } else {
                /* is a recv */
                assert(ret == IO_RECV_COMPLETED);
                matched_send_or_recv = 1;
            }
        }
    }

    /* if we've just completed a recv above, post the flow here */
    if (ret == IO_RECV_COMPLETED)
    {
        ret = process_context_recv_and_post_flow(sm_p, js_p, &cur_ctx);
        if (ret < 0)
        {
            char buf[64] = {0};
            PVFS_strerror_r(ret, buf, 64);

            gossip_debug(GOSSIP_IO_DEBUG,
              "%s: process_context_recv_and_post_flow failed: "
              "%s (%d remaining msgpairs)\n",
              __func__, buf, sm_p->u.io.msgpair_completion_count);

            js_p->error_code = ret;
            /* if recv failed, probably have to do the send again too */
            cur_ctx->msg_send_has_been_posted = 0;
            cur_ctx->msg_recv_has_been_posted = 0;
            goto check_next_step;
        }
    }

    /* check if we've completed all msgpairs and posted all flows */
    if (matched_send_or_recv)
    {
        if (sm_p->u.io.msgpair_completion_count == 0)
        {
            gossip_debug(GOSSIP_IO_DEBUG, "*** all msgpairs complete "
                         "(all flows posted)\n");
        }
        else
        {
            gossip_debug(
                GOSSIP_IO_DEBUG, "*** %d msgpair completions "
                "pending\n", sm_p->u.io.msgpair_completion_count);
        }
        return 0;
    }

    /* at this point, we're either completing a flow or a write ack */
    if (STATUS_USER_TAG_TYPE(status_user_tag, IO_SM_PHASE_FLOW))
    {
        assert(sm_p->u.io.flow_completion_count);

        index = STATUS_USER_TAG_GET_INDEX(
            status_user_tag, IO_SM_PHASE_FLOW);
        cur_ctx = &sm_p->u.io.contexts[index];
        assert(cur_ctx);

        cur_ctx->flow_status = *js_p;

        if (cur_ctx->write_ack_in_progress)
        {
            /* distinct name: do not shadow the function-level ret */
            int reset_ret = 0;

            assert(sm_p->u.io.write_ack_completion_count);
            server_config = PINT_get_server_config_struct(sm_p->object_ref.fs_id);
            reset_ret = job_reset_timeout(cur_ctx->write_ack.recv_id,
                server_config->client_job_bmi_timeout);
            PINT_put_server_config_struct(server_config);

            /*
              allow -PVFS_EINVAL errors in case the recv has already
              completed (before we've processed it)
            */
            assert((reset_ret == 0) || (reset_ret == -PVFS_EINVAL));
            (void)reset_ret; /* only inspected by the assert above */
        }

        gossip_debug(GOSSIP_IO_DEBUG, "  matched completed flow for "
                     "context %p%s\n", cur_ctx,
                     ((cur_ctx->write_ack_in_progress ?
                       " and reset write_recv timeout" : "")));

        cur_ctx->flow_in_progress = 0;
        sm_p->u.io.flow_completion_count--;
        assert(sm_p->u.io.flow_completion_count > -1);

        /* if error, restart; but if this is a write, let write ack catch */
        if (js_p->error_code < 0 && !cur_ctx->write_ack_in_progress) {
            gossip_debug(GOSSIP_IO_DEBUG,
              "%s: flow failed, retrying from msgpair\n", __func__);
            cur_ctx->msg_send_has_been_posted = 0;
            cur_ctx->msg_recv_has_been_posted = 0;
        }
    }
    else if (STATUS_USER_TAG_TYPE(status_user_tag, IO_SM_PHASE_FINAL_ACK))
    {
        assert(sm_p->u.io.write_ack_completion_count);

        index = STATUS_USER_TAG_GET_INDEX(
            status_user_tag, IO_SM_PHASE_FINAL_ACK);
        cur_ctx = &sm_p->u.io.contexts[index];
        assert(cur_ctx);

        cur_ctx->write_ack.recv_id = 0;
        cur_ctx->write_ack.recv_status = *js_p;

        /* validate the status that was just recorded, not the stale one */
        assert(cur_ctx->write_ack.recv_status.actual_size <=
               cur_ctx->write_ack.max_resp_sz);

        gossip_debug(GOSSIP_IO_DEBUG, "  matched completed ack for "
                     "context %p\n", cur_ctx);

        cur_ctx->write_ack_in_progress = 0;
        sm_p->u.io.write_ack_completion_count--;
        assert(sm_p->u.io.write_ack_completion_count > -1);

        if (js_p->error_code < 0) {
            gossip_debug(GOSSIP_IO_DEBUG,
              "%s: write-ack failed, retrying from msgpair\n", __func__);
            cur_ctx->msg_send_has_been_posted = 0;
            cur_ctx->msg_recv_has_been_posted = 0;
        }
    }

  check_next_step:

    /*
     * If something is pending, return 0 to let SM find the next thing
     * to do.
     */
    if (sm_p->u.io.msgpair_completion_count
     || sm_p->u.io.flow_completion_count
     || sm_p->u.io.write_ack_completion_count) {
        if (sm_p->op_cancelled)
            gossip_debug(GOSSIP_IO_DEBUG, "detected I/O cancellation with "
                         "%d flows and %d write acks pending\n",
                         sm_p->u.io.flow_completion_count,
                         sm_p->u.io.write_ack_completion_count);
        else
            gossip_debug(GOSSIP_IO_DEBUG, " %d flows pending, %d write acks "
                         "pending, %d msgpair\n", sm_p->u.io.flow_completion_count,
                         sm_p->u.io.write_ack_completion_count,
                         sm_p->u.io.msgpair_completion_count);
        return 0;
    }

    /*
     * Else either we've finished it all or have some msgpairs to retry
     * that failed earlier.  A context whose send or recv "posted" flag
     * has been cleared needs its msgpair posted again.
     */
    for (i=0; i < sm_p->u.io.datafile_count; i++) {
        PINT_client_io_ctx *ctx = &sm_p->u.io.contexts[i];
        if (!ctx->msg_recv_has_been_posted)
            break;
        if (!ctx->msg_send_has_been_posted)
            break;
    }
    if (i < sm_p->u.io.datafile_count) {
        gossip_debug(GOSSIP_IO_DEBUG,
          "*** %s: some msgpairs to repost\n", __func__);
        js_p->error_code = IO_RETRY;
    } else {
        gossip_debug(GOSSIP_IO_DEBUG, "*** all operations %s "
                     "(msgpairs, flows, write acks)\n",
                     (sm_p->op_cancelled ? "cancelled" : "completed"));
        js_p->error_code = IO_DATAFILE_TRANSFERS_COMPLETE;
    }
    return 1;
}

/*
 * Examines the outcome of the datafile transfers and decides what the
 * state machine does next.
 *
 * - Drops PVFS_ATTR_DATA_SIZE from the cached pinode's attribute mask.
 * - Derives a final status (ret) from js_p->error_code, the stored
 *   error code, cancellation state and the per-context statuses.
 * - For BMI/flow/connection-reset/protocol errors, while retry_count
 *   is below retry_limit, resets the totals and sets IO_RETRY.
 * - For writes, reports ret and the number of bytes transferred.
 * - For reads, works out how many bytes to report, which may require
 *   the sizes of all datafiles (IO_GET_DATAFILE_SIZE) or a direct jump
 *   to the size analysis (IO_ANALYZE_SIZE_RESULTS).
 *
 * Always returns 1; the outcome is carried in js_p->error_code.
 */
static int io_analyze_results(PINT_client_sm *sm_p,
                              job_status_s *js_p)
{
    PINT_pinode *pinode;
    int ret = -PVFS_EINVAL, i = 0;

    gossip_debug(GOSSIP_CLIENT_DEBUG, "(%p) io state: "
                 "io_analyze_results\n", sm_p);

    /* FIXME: right now we invalidate size from attribute in acache.  
     * When zero-fill stuff gets figured out we should remove this
     * and update the pinode->size
     *
     * Now we also use the pinode from the attribute cache for holding
     * the distribution info from the getattr.
     */
    pinode = PINT_acache_lookup(sm_p->object_ref, NULL, NULL);

    /* we assume pinode has to be cached from previous getattr */
    assert(pinode);
    pinode->attr.mask &= ~PVFS_ATTR_DATA_SIZE;

    if (js_p->error_code != IO_DATAFILE_TRANSFERS_COMPLETE)
    {
        ret = (sm_p->u.io.stored_error_code ?
               sm_p->u.io.stored_error_code :
               js_p->error_code);

        if (ret == 0)
        {
            ret = (sm_p->op_cancelled ? -PVFS_ECANCEL : -PVFS_EIO);
        }
    }
    else if (!sm_p->op_cancelled)
    {
        /*
          look through all the contexts for errors, saving the first
          one to return (if any) while adding up the size of the
          transfer (in case things actually completed).
        */
        assert(sm_p->msgarray_count == sm_p->u.io.datafile_count);
        for(i = 0; i < sm_p->u.io.datafile_count; i++)
        {
            PINT_client_io_ctx *cur_ctx = &sm_p->u.io.contexts[i];
            assert(cur_ctx);

            ret = check_context_status(
                cur_ctx, sm_p->u.io.io_type, &sm_p->u.io.total_size);
            if (ret < 0)
            {
                if (ret == -PVFS_ECANCEL)
                {
                    gossip_debug(GOSSIP_IO_DEBUG, "*** I/O operation "
                                 "cancelled\n");
                }
                else
                {
                    PVFS_perror_gossip(
                        "check_context_status found error", ret);
                }
                break;
            }

            gossip_debug(
                GOSSIP_IO_DEBUG, "[%d/%d] running size is %Ld\n",
                (i + 1), sm_p->u.io.datafile_count,
                Ld(sm_p->u.io.total_size));
        }

        /*
          at this point, we may know an error occurred.  if we
          couldn't find any errors in the context, use the preserved
          error code from the complete_operations state (which may be
          success)
        */
        if (ret == 0)
        {
            char buf[64] = {0};

            ret = (sm_p->op_cancelled ? -PVFS_ECANCEL :
                   sm_p->u.io.stored_error_code);

            PVFS_strerror_r(ret, buf, 64);
            gossip_debug(GOSSIP_IO_DEBUG, "no context errors found; "
                         "using: %s\n", buf);
        }

    }
    else
    {
        ret = (sm_p->op_cancelled ? -PVFS_ECANCEL : -PVFS_EIO);
    }

    /* be sure there are no jobs still laying around */
    assert((sm_p->u.io.msgpair_completion_count == 0) &&
           (sm_p->u.io.flow_completion_count == 0) &&
           (sm_p->u.io.write_ack_completion_count == 0));

    /*
      FIXME: non bmi errors pop out in flow failures above -- they are
      not properly marked as flow errors either, so we check for them
      explicitly here (but not all -- fix it for real).
    */
    if (((PVFS_ERROR_CLASS(-ret) == PVFS_ERROR_BMI) ||
         (PVFS_ERROR_CLASS(-ret) == PVFS_ERROR_FLOW) ||
         (ret == -ECONNRESET) || (ret == -PVFS_EPROTO)) &&
        (sm_p->u.io.retry_count < sm_p->msgarray_params.retry_limit))
    {
        PINT_acache_invalidate(sm_p->object_ref);

        assert(!sm_p->op_cancelled);

        sm_p->u.io.stored_error_code = 0;
        sm_p->u.io.total_size = 0;  /* start from the beginning again */
        sm_p->u.io.retry_count++;

        gossip_debug(GOSSIP_IO_DEBUG, "Retrying I/O operation "
                     "(attempt number %d)\n", sm_p->u.io.retry_count);

        js_p->error_code = IO_RETRY;
        goto analyze_results_exit;
    }

    gossip_debug(GOSSIP_IO_DEBUG, "total bytes transferred is %Ld\n",
                 Ld(sm_p->u.io.total_size));

    if(sm_p->u.io.io_type == PVFS_IO_WRITE)
    {
        /* propagate the status computed above; forcing 0 here would
         * report a failed or cancelled write as a success
         */
        js_p->error_code = ret;
        sm_p->u.io.io_resp_p->total_completed = sm_p->u.io.total_size;

        /* we can skip the check for holes since its only needed in the
         * case of reads
         */
        goto analyze_results_exit;
    }

    /* In order to give the sysint caller the correct value for length
     * of bytes read, we have to check for holes in the logical file.  
     * The algorithm is as follows:
     *
     * 1. If the size of the memory request is equivalent to the number of
     * bytes read, we know there are no holes and the total size
     * is the correct value to return back to the caller.
     *
     * 2. If 1. is false, either there's a hole in the file within the
     * region of the file request, or the request is past EOF.  If the request
     * is NOT past EOF, then the return value for length of bytes read
     * is equivalent to the size of the memory request.  To check that the
     * request is not past EOF, we iterate through the target datafiles,
     * calculate the logical file offsets of each based on the their physical
     * bstream size, looking for a logical offset that is >= the upper bound
     * of the memory request.  If we find a target datafile that matches this
     * criteria, we know that the request is not past EOF, and that
     * returned bytes read is equivalent to the size of the memory request.
     *
     * 3. If none of the target datafile logical offsets are >= the upper
     * bound of the file request, we still must check all the datafiles
     * that are beyond the last target datafile.  To do this we have to
     * go and get the sizes of each one, and perform the above comparison
     * on them as well.  Again, if one of their logical offsets >= the upper
     * bound of the file request, we know the returned bytes read is the
     * size of the file request.  
     *
     * 4. If we don't find any datafiles with 
     * logical offset >= the upper bound of the file request, the returned
     * bytes read value is the size of the file request minus the last offset
     * of the datafiles (where the EOF occurs).
     */
    if(sm_p->u.io.total_size == PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req))
    {
        sm_p->u.io.io_resp_p->total_completed = sm_p->u.io.total_size;
    }
    else
    {
        int i = 0;
        PINT_request_file_data file_data;
        PVFS_size memreq_ub_offset;
        
        /* compute the upper bound of the memory request.  This is the
         * logical offset used to compare against all the datafile
         * logical offsets
         */
        PVFS_Request_ub(sm_p->u.io.mem_req, &memreq_ub_offset);
        memreq_ub_offset += sm_p->u.io.file_req_offset;

        memset(&file_data, 0, sizeof(file_data));
        file_data.dist = pinode->attr.u.meta.dist;
        file_data.server_ct = pinode->attr.u.meta.dfile_count;
        file_data.extend_flag = 0;

        /* iterate through each of the target datafiles */
        for(i = 0; i < sm_p->u.io.datafile_count; ++i)
        {
            PVFS_offset datafile_logical_offset;
            file_data.server_nr = sm_p->u.io.datafile_index_array[i];

            datafile_logical_offset = 
                pinode->attr.u.meta.dist->methods->physical_to_logical_offset(
                    pinode->attr.u.meta.dist->params,
                    &file_data,
                    sm_p->u.io.contexts[i].flow_desc.file_data.fsize);

            if(datafile_logical_offset >= memreq_ub_offset)
            {
                /* we found a logical offset that is past the end of the
                 * request, so we know the request is not past EOF
                 */
                sm_p->u.io.io_resp_p->total_completed = 
                    PINT_REQUEST_TOTAL_BYTES(
                        sm_p->u.io.mem_req);
                break;
            }
        }

        /* if the above loop fails to find a datafile that matches, and
         * the target datafile count equals the total datafile count, we
         * can go straight to analyze_size_results.  Otherwise, we
         * need to get the rest of the datafiles.  At some point we should fix
         * this and the getattr state machine to allow us to only get
         * the remaining datafile sizes that we need, instead of 
         * getting all of them.  Right now the getattr state machine
         * just gets them all.
         */
        if(i == sm_p->u.io.datafile_count)
        {
            PINT_SM_GETATTR_STATE_CLEAR(sm_p->getattr);
            PINT_SM_DATAFILE_SIZE_ARRAY_INIT(
                &sm_p->u.io.dfile_size_array, 
                pinode->attr.u.meta.dfile_count);

            if(sm_p->u.io.datafile_count == pinode->attr.u.meta.dfile_count)
            {
                for(i = 0; i < pinode->attr.u.meta.dfile_count; ++i)
                {
                    sm_p->u.io.dfile_size_array[i] = 
                        sm_p->u.io.contexts[i].flow_desc.file_data.fsize;
                }

                /* we skip getting all the datafile sizes (since we already
                 * have them) and go straight to analyzing the sizes state.
                 */
                js_p->error_code = IO_ANALYZE_SIZE_RESULTS;
                goto analyze_results_exit;
            }

            PINT_SM_GETATTR_STATE_FILL(sm_p->getattr,
                                       sm_p->object_ref,
                                       PVFS_ATTR_DATA_SIZE,
                                       PVFS_TYPE_NONE);
            sm_p->getattr.size_array = sm_p->u.io.dfile_size_array;

            /* setting this state result will cause the state machine to
             * jump to getattr and get all the datafile sizes from all the
             * servers.  Once complete, we will then jump back into the
             * io state machine at the analyze_size_results state.
             */
            js_p->error_code = IO_GET_DATAFILE_SIZE;
            goto analyze_results_exit;
        }
    }

    js_p->error_code = ret;

analyze_results_exit:

    if(pinode)
    {
        PINT_acache_release(pinode);
    }

    return 1;
}

/*
 * Finishes the past-EOF check for reads using the sizes of all
 * datafiles in u.io.dfile_size_array, and stores the number of bytes
 * read in u.io.io_resp_p->total_completed.
 *
 * Always returns 1.  js_p->error_code is 0 on success, -PVFS_ENOMEM if
 * a request state could not be created, or the negative value returned
 * by PINT_process_request().  The pinode reference and both request
 * states are released on every path.
 */
static int io_analyze_size_results(
    PINT_client_sm *sm_p, job_status_s *js_p)
{
    /* reset the msgarray_count to its previous value before
     * the getattr state machine was called.
     */
    sm_p->msgarray_count = sm_p->u.io.datafile_count;

    /* Now that we have all the datafile sizes, 
     * this state allows us to finish our check that the file request
     * is not beyond EOF, and return the appropriate value for bytes
     * read to the sysint caller.
     *
     * The check iterates through all the datafiles and compares
     * their logical sizes with the upper bound of the file request.
     * If one of the datafile's logical sizes is >= than the ub,
     * we know the request is not past EOF.  Otherwise it must be, and
     * the return value for bytes read is calculated from the size
     * of the file request and the greatest logical offset of 
     * the datafiles (the actual EOF).
     */
    
    int i = 0;
    PVFS_offset max_datafile_logical_offset = 0;
    PVFS_offset datafile_logical_offset;
    PVFS_offset memreq_ub_offset;
    PVFS_offset memreq_lb_offset;
    PINT_request_file_data file_data;
    PINT_pinode * pinode = PINT_acache_lookup(sm_p->object_ref, NULL, NULL); 

    /* as in io_analyze_results: the pinode is assumed to be cached */
    assert(pinode);

    PVFS_Request_ub(sm_p->u.io.mem_req, &memreq_ub_offset);
    memreq_ub_offset += sm_p->u.io.file_req_offset;

    PVFS_Request_lb(sm_p->u.io.file_req, &memreq_lb_offset);
    memreq_lb_offset += sm_p->u.io.file_req_offset;

    memset(&file_data, 0, sizeof(file_data));
    file_data.dist = pinode->attr.u.meta.dist;
    file_data.server_ct = pinode->attr.u.meta.dfile_count;
    file_data.extend_flag = 0;
 
    for(i = 0; i < pinode->attr.u.meta.dfile_count; ++i)
    {
        file_data.server_nr = i;

        datafile_logical_offset = 
            pinode->attr.u.meta.dist->methods->physical_to_logical_offset(
                pinode->attr.u.meta.dist->params,
                &file_data,
                sm_p->u.io.dfile_size_array[i]);

        if(datafile_logical_offset >= memreq_ub_offset)
        {
            /* we found a logical offset that is past the end of the
             * request, so we know the request is not past EOF
             */
            sm_p->u.io.io_resp_p->total_completed = 
                    PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req);
            break;
        }

        max_datafile_logical_offset = PVFS_util_max(
                datafile_logical_offset, max_datafile_logical_offset);
    }

    if(i == pinode->attr.u.meta.dfile_count)
    {
        /* At this point there are no datafiles that have a logical
         * offset past the upper bound of the file request, so we know that
         * the request is beyond the EOF of the file.  We compute
         * the return value for bytes read by finding the upper bound of the
         * memory request *within* the logical file (before EOF).  This is
         * the end of the contiguous segment in the file request < EOF.
         * The number of bytes read is then the length of the file request
         * from start to this point.
         */

        PINT_Request_state * filereq_state;
        PINT_Request_state * memreq_state;
        PINT_request_file_data rfdata;
        PINT_Request_result result;
        int res = 0;
        PVFS_offset offset = 0;
        PVFS_size size = 0;
        PVFS_size total_size = 0;

        filereq_state = PINT_new_request_state(sm_p->u.io.file_req);
        memreq_state = PINT_new_request_state(sm_p->u.io.mem_req);
        if(!filereq_state || !memreq_state)
        {
            res = -PVFS_ENOMEM;
            goto free_request_states;
        }

        /* zero first so fields not set below (e.g. server_nr) are
         * well defined when handed to PINT_process_request
         */
        memset(&rfdata, 0, sizeof(rfdata));
        rfdata.server_ct = 1;
        rfdata.fsize = max_datafile_logical_offset;
        rfdata.dist = pinode->attr.u.meta.dist;
        rfdata.extend_flag = 0;

        memset(&result, 0, sizeof(result));
        result.offset_array = &offset;
        result.size_array = &size;
        result.segmax = 1;
        result.bytemax = PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req); 

        PINT_REQUEST_STATE_SET_TARGET(filereq_state, memreq_lb_offset);
        PINT_REQUEST_STATE_SET_FINAL(
            filereq_state, max_datafile_logical_offset);

        do
        {
            result.segs = 0;
            result.bytes = 0;
            
            res = PINT_process_request(filereq_state, memreq_state,
                                       &rfdata, &result, PINT_CLIENT);
            if(res < 0)
            {
                /* reported after the request states are released */
                break;
            }

            if(result.segs > 0)
            {
                /* offset/size are only meaningful when a segment
                 * was actually produced
                 */
                PVFS_offset total_offset =
                    sm_p->u.io.file_req_offset + offset;

                if((max_datafile_logical_offset >= total_offset) &&
                   (max_datafile_logical_offset <= total_offset + size))
                {
                    total_size += (max_datafile_logical_offset - 
                                   total_offset);
                    break;
                }
                else if(max_datafile_logical_offset < total_offset)
                {
                    break;
                }
                else
                {
                    total_size += size;
                }
            }
        } while(!PINT_REQUEST_DONE(filereq_state) && result.segs);

      free_request_states:
        /* the request states are private to this block; always free */
        if(filereq_state)
        {
            PINT_free_request_state(filereq_state);
        }
        if(memreq_state)
        {
            PINT_free_request_state(memreq_state);
        }

        if(res < 0)
        {
            js_p->error_code = res;
            goto error_exit;
        }

        sm_p->u.io.io_resp_p->total_completed =
            PVFS_util_min(PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req),
                          total_size);
    }

    js_p->error_code = 0;

error_exit:
    /* drop the acache reference on success and failure alike */
    PINT_acache_release(pinode);
    return 1;
}

/*
 * Cleanup state for the I/O machine: runs CLEAN_PRIVATE_MEMBERS, clears
 * the getattr state, destroys the datafile size array if one is set,
 * copies the final status from js_p into sm_p->error_code and flags the
 * operation as complete.  Always returns 0.
 */
static int io_cleanup(PINT_client_sm *sm_p,
                      job_status_s *js_p)
{
    gossip_debug(GOSSIP_CLIENT_DEBUG,
                 "(%p) io state: io_cleanup\n", sm_p);

    CLEAN_PRIVATE_MEMBERS(sm_p);
    PINT_SM_GETATTR_STATE_CLEAR(sm_p->getattr);

    /* nothing to destroy unless a size array was set up earlier */
    if (sm_p->u.io.dfile_size_array != NULL)
    {
        PINT_SM_DATAFILE_SIZE_ARRAY_DESTROY(&sm_p->u.io.dfile_size_array);
    }

    /* record the final status and log it when it is not a success */
    sm_p->error_code = js_p->error_code;
    if (sm_p->error_code != 0)
    {
        char errstr[64] = {0};

        PVFS_strerror_r(sm_p->error_code, errstr, sizeof(errstr));
        gossip_debug(GOSSIP_IO_DEBUG,
                     "*** Final I/O operation error is %s\n", errstr);
    }

    sm_p->op_complete = 1;

    return 0;
}

/********************************************************************/

/*
  Records the completion of a request msgpair send or recv.

  The phase and context index are decoded from js_p->status_user_tag.
  The job status is copied into the matching msgpair (recv_status or
  send_status), its job id is cleared, the context's in-progress flag
  is reset and u.io.msgpair_completion_count is decremented.

  returns 0 on send completion; IO_RECV_COMPLETED on recv completion,
  and -PVFS_error on failure (-PVFS_EINVAL if the tag is neither a
  msgpair send nor a msgpair recv)
*/
static inline int complete_context_send_or_recv(
    PINT_client_sm *sm_p,
    job_status_s *js_p)
{
    int ret = -PVFS_EINVAL, index = 0;
    unsigned long status_user_tag = 0;
    PINT_client_io_ctx *cur_ctx = NULL;
    PINT_sm_msgpair_state *msg = NULL;

    gossip_debug(GOSSIP_IO_DEBUG,
                 "- complete_context_send_or_recv called\n");

    assert(sm_p && js_p);
    assert(sm_p->op == PVFS_SYS_IO);

    status_user_tag = (unsigned long)js_p->status_user_tag;

    if (STATUS_USER_TAG_TYPE(
            status_user_tag, IO_SM_PHASE_REQ_MSGPAIR_RECV))
    {
        index = STATUS_USER_TAG_GET_INDEX(
            status_user_tag, IO_SM_PHASE_REQ_MSGPAIR_RECV);

        gossip_debug(GOSSIP_IO_DEBUG, "got a recv completion with "
                     "context index %d\n", index);

        cur_ctx = &sm_p->u.io.contexts[index];
        assert(cur_ctx);

        msg = &sm_p->msgarray[index];
        msg->recv_id = 0;
        msg->recv_status = *js_p;

        assert(msg->recv_status.error_code <= 0);
        assert(msg->recv_status.actual_size <= msg->max_resp_sz);

        cur_ctx->msg_recv_in_progress = 0;
        sm_p->u.io.msgpair_completion_count--;

        ret = IO_RECV_COMPLETED;
    }
    else if (STATUS_USER_TAG_TYPE(
                 status_user_tag, IO_SM_PHASE_REQ_MSGPAIR_SEND))
    {
        /* decode the index with the phase that was actually matched */
        index = STATUS_USER_TAG_GET_INDEX(
            status_user_tag, IO_SM_PHASE_REQ_MSGPAIR_SEND);

        gossip_debug(GOSSIP_IO_DEBUG, "got a send completion with "
                     "context index %d\n", index);

        cur_ctx = &sm_p->u.io.contexts[index];
        assert(cur_ctx);

        msg = &sm_p->msgarray[index];
        msg->send_id = 0;
        msg->send_status = *js_p;

        assert(msg->send_status.error_code <= 0);

        cur_ctx->msg_send_in_progress = 0;
        sm_p->u.io.msgpair_completion_count--;

        ret = 0;
    }
    return ret;
}

/*
 * Decodes the server response held by cur_ctx->msg and records its
 * status in the msgpair's op_status.
 *
 * Returns 0 when the response decoded and neither the recv job nor the
 * server reported an error.  Returns the decode error if decoding
 * failed.  Otherwise returns the recv job error (preferred) or the
 * server op status, after freeing the msgpair resources.
 */
static inline int process_context_recv(
    PINT_client_io_ctx *cur_ctx,
    struct PINT_decoded_msg *decoded_resp,
    struct PVFS_server_resp **resp)
{
    int ret = -PVFS_EINVAL;

    gossip_debug(GOSSIP_IO_DEBUG, "- process_context_recv called\n");

    assert(cur_ctx && cur_ctx->msg && decoded_resp && resp);

    ret = PINT_serv_decode_resp(
        cur_ctx->msg->fs_id,
        cur_ctx->msg->encoded_resp_p,
        decoded_resp,
        &cur_ctx->msg->svr_addr,
        cur_ctx->msg->recv_status.actual_size,
        resp);
    if (ret != 0)
    {
        PVFS_perror("PINT_server_decode_resp failed", ret);
        return ret;
    }

    assert((*resp)->status < 1);
    cur_ctx->msg->op_status = (*resp)->status;

    /* clean response: nothing more to do here (ret is 0) */
    if (!cur_ctx->msg->recv_status.error_code &&
        !cur_ctx->msg->op_status)
    {
        return ret;
    }

    gossip_debug(
        GOSSIP_IO_DEBUG, "  error %d with status %d related "
        "to response from context %p; not submitting flow.\n",
        cur_ctx->msg->recv_status.error_code,
        cur_ctx->msg->op_status, cur_ctx);

    /* a recv job error takes precedence over the server's op status */
    if (cur_ctx->msg->recv_status.error_code)
    {
        ret = cur_ctx->msg->recv_status.error_code;
        PVFS_perror_gossip(
            "process_context_recv (recv_status.error_code)",
            cur_ctx->msg->recv_status.error_code);
    }
    else
    {
        /* only reachable with a non-zero op_status */
        ret = cur_ctx->msg->op_status;
        PVFS_perror_gossip("process_context_recv (op_status)",
                           cur_ctx->msg->op_status);
    }

    PINT_serv_free_msgpair_resources(
        &cur_ctx->msg->encoded_req,
        cur_ctx->msg->encoded_resp_p,
        decoded_resp,
        &cur_ctx->msg->svr_addr,
        cur_ctx->msg->max_resp_sz);

    return ret;
}

/*
 * Fills in cur_ctx->flow_desc for the data transfer that follows a
 * request msgpair: file layout data from attr and the server response,
 * the file and memory requests from sm_p, and the endpoints, whose
 * direction depends on whether this is a read or a write.
 *
 * Returns -PVFS_EINVAL if any argument is NULL, 0 otherwise.
 */
static inline int build_context_flow(
    PINT_client_sm *sm_p,
    PINT_client_io_ctx *cur_ctx,
    PVFS_object_attr *attr,
    struct PVFS_server_resp *resp)
{
    gossip_debug(GOSSIP_IO_DEBUG, "- build_context_flow called\n");

    if ((sm_p == NULL) || (cur_ctx == NULL) ||
        (attr == NULL) || (resp == NULL))
    {
        return -PVFS_EINVAL;
    }

    gossip_debug(GOSSIP_IO_DEBUG, "* mem req size is %Ld, "
                 "file_req size is %Ld (bytes)\n",
                 Ld(PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.mem_req)),
                 Ld(PINT_REQUEST_TOTAL_BYTES(sm_p->u.io.file_req)));

    /* must reset the error_code and internal PINT_distribute fields
     * in case of a retry */
    PINT_flow_reset(&cur_ctx->flow_desc);

    /* request descriptions and flow identity */
    cur_ctx->flow_desc.mem_req = sm_p->u.io.mem_req;
    cur_ctx->flow_desc.file_req = sm_p->u.io.file_req;
    cur_ctx->flow_desc.file_req_offset = sm_p->u.io.file_req_offset;
    cur_ctx->flow_desc.type = sm_p->u.io.flowproto_type;
    cur_ctx->flow_desc.tag = cur_ctx->session_tag;
    cur_ctx->flow_desc.user_ptr = NULL;

    /* layout of this datafile within the logical file */
    cur_ctx->flow_desc.file_data.dist = attr->u.meta.dist;
    cur_ctx->flow_desc.file_data.server_ct = attr->u.meta.dfile_count;
    cur_ctx->flow_desc.file_data.server_nr = cur_ctx->server_nr;
    cur_ctx->flow_desc.file_data.fsize = resp->u.io.bstream_size;

    gossip_debug(GOSSIP_IO_DEBUG, "  bstream_size = %Ld, datafile "
                 "nr=%d, ct=%d, file_req_off = %Ld\n",
                 Ld(cur_ctx->flow_desc.file_data.fsize),
                 cur_ctx->flow_desc.file_data.server_nr,
                 cur_ctx->flow_desc.file_data.server_ct,
                 Ld(cur_ctx->flow_desc.file_req_offset));

    switch (sm_p->u.io.io_type)
    {
        case PVFS_IO_READ:
            /* server -> memory */
            cur_ctx->flow_desc.src.endpoint_id = BMI_ENDPOINT;
            cur_ctx->flow_desc.src.u.bmi.address = cur_ctx->msg->svr_addr;
            cur_ctx->flow_desc.dest.endpoint_id = MEM_ENDPOINT;
            cur_ctx->flow_desc.dest.u.mem.buffer = sm_p->u.io.buffer;
            cur_ctx->flow_desc.file_data.extend_flag = 0;
            break;

        default:
            /* anything that is not a read must be a write */
            assert(sm_p->u.io.io_type == PVFS_IO_WRITE);

            /* memory -> server */
            cur_ctx->flow_desc.src.endpoint_id = MEM_ENDPOINT;
            cur_ctx->flow_desc.src.u.mem.buffer = sm_p->u.io.buffer;
            cur_ctx->flow_desc.dest.endpoint_id = BMI_ENDPOINT;
            cur_ctx->flow_desc.dest.u.bmi.address = cur_ctx->msg->svr_addr;
            cur_ctx->flow_desc.file_data.extend_flag = 1;
            break;
    }

    return 0;
}

/*
 * Handles the completed request-ack receive for one I/O context and
 * starts the data transfer for that context.
 *
 * Steps, in order:
 *   - locate the context from the index encoded in js_p->status_user_tag
 *     and hand it back through *out_ctx (this happens before any of the
 *     error returns below, so the caller always learns which context the
 *     job belonged to)
 *   - decode the server response (process_context_recv)
 *   - fill in the flow descriptor (build_context_flow)
 *   - release the msgpair resources
 *   - for writes only: allocate a buffer for, and pre-post the receive
 *     of, the final write ack
 *   - post the flow
 *
 * Returns:
 *   - js_p->error_code if the job itself completed with an error
 *   - the non-zero result of any failing step above (-PVFS_ENOMEM if the
 *     write ack buffer cannot be allocated)
 *   - otherwise the return value of job_flow(): 1 if the flow completed
 *     immediately, or the value returned for a posted flow, in which
 *     case the context is marked as having a flow in progress and
 *     sm_p->u.io.flow_completion_count is incremented
 */
static inline int process_context_recv_and_post_flow(
    PINT_client_sm *sm_p,
    job_status_s *js_p, PINT_client_io_ctx **out_ctx)
{
    int ret = -PVFS_EINVAL, index = 0;
    unsigned long status_user_tag = 0;
    struct PINT_decoded_msg decoded_resp;
    struct PVFS_server_resp *resp = NULL;
    PVFS_object_attr *attr = NULL;
    PINT_client_io_ctx *cur_ctx = NULL;
    struct server_configuration_s *server_config = NULL;

    gossip_debug(GOSSIP_IO_DEBUG,
                 "- process_context_recv_and_post_flow called\n");

    assert(sm_p && js_p);

    /*
      the tag must be loaded from the job status *before* it is
      checked; asserting on the zero-initialized local would never
      examine the tag that was actually delivered
    */
    status_user_tag = (unsigned long)js_p->status_user_tag;
    assert(STATUS_USER_TAG_TYPE(
               status_user_tag, IO_SM_PHASE_REQ_MSGPAIR_RECV));

    index = STATUS_USER_TAG_GET_INDEX(
        status_user_tag, IO_SM_PHASE_REQ_MSGPAIR_RECV);

    cur_ctx = &sm_p->u.io.contexts[index];
    assert(cur_ctx && cur_ctx->msg);
    *out_ctx = cur_ctx;

    if (js_p->error_code)
    {
        {
            char buf[1024];
            PVFS_strerror_r(js_p->error_code, buf, sizeof(buf));
            buf[sizeof(buf)-1] = '\0';
            gossip_debug(GOSSIP_IO_DEBUG, "%s: entered with error: %s\n",
              __func__, buf);
        }
        return js_p->error_code;
    }

    ret = process_context_recv(cur_ctx, &decoded_resp, &resp);
    if (ret)
    {
        {
            char buf[1024];
            /*
              report the failure returned by process_context_recv;
              js_p->error_code is known to be zero at this point
            */
            PVFS_strerror_r(ret, buf, sizeof(buf));
            buf[sizeof(buf)-1] = '\0';
            gossip_debug(GOSSIP_IO_DEBUG, "%s: failed: %s\n", __func__, buf);
        }
        return ret;
    }

    attr = &sm_p->getattr.attr;
    assert(attr);

    ret = build_context_flow(sm_p, cur_ctx, attr, resp);
    if (ret < 0)
    {
        PVFS_perror_gossip("build_context_flow failed", ret);
        return ret;
    }

    /* the decoded response is no longer needed once the flow is built */
    ret = PINT_serv_free_msgpair_resources(
        &cur_ctx->msg->encoded_req, cur_ctx->msg->encoded_resp_p,
        &decoded_resp, &cur_ctx->msg->svr_addr,
        cur_ctx->msg->max_resp_sz);

    if (ret)
    {
        PVFS_perror_gossip("PINT_serv_free_msgpair_resources "
                           "failed", ret);
        return ret;
    }

    if (sm_p->u.io.io_type == PVFS_IO_WRITE)
    {
        gossip_debug(GOSSIP_IO_DEBUG, "  preposting write "
                     "ack for context %p.\n", cur_ctx);

        cur_ctx->write_ack.max_resp_sz = PINT_encode_calc_max_size(
            PINT_ENCODE_RESP, PVFS_SERV_WRITE_COMPLETION,
            sm_p->u.io.encoding);
        cur_ctx->write_ack.encoded_resp_p = BMI_memalloc(
            cur_ctx->msg->svr_addr, cur_ctx->write_ack.max_resp_sz,
            BMI_RECV);

        if (!cur_ctx->write_ack.encoded_resp_p)
        {
            gossip_err("BMI_memalloc (for write ack) failed\n");
            return -PVFS_ENOMEM;
        }

        /*
          we're pre-posting the final write ack here, even though it's
          ahead of the flow phase; reads have no final ack and go
          straight to the flow phase.
        */
        status_user_tag = ((4 * cur_ctx->index) + IO_SM_PHASE_FINAL_ACK);

        /*
          pre-post this recv with an infinite timeout and adjust it
          after the flow completes since we don't know how long a flow
          can take at this point
        */
        ret = job_bmi_recv(
            cur_ctx->msg->svr_addr, cur_ctx->write_ack.encoded_resp_p,
            cur_ctx->write_ack.max_resp_sz, cur_ctx->session_tag,
            BMI_PRE_ALLOC, sm_p, status_user_tag,
            &cur_ctx->write_ack.recv_status, &cur_ctx->write_ack.recv_id,
            pint_client_sm_context, JOB_TIMEOUT_INF);

        if (ret < 0)
        {
            /*
              NOTE(review): the ack buffer allocated above is not
              released on this path and write_ack_has_been_posted stays
              clear -- confirm that some later cleanup frees it
            */
            gossip_err("job_bmi_recv (write ack) failed\n");
            return ret;
        }

        /* a pre-posted ack is not expected to complete immediately */
        assert(ret == 0);
        cur_ctx->write_ack_has_been_posted = 1;
        cur_ctx->write_ack_in_progress = 1;
        sm_p->u.io.write_ack_completion_count++;
    }

    status_user_tag = ((4 * cur_ctx->index) + IO_SM_PHASE_FLOW);

    /*
      NOTE(review): server_config is dereferenced without a NULL check;
      presumably the fs_id is always known at this point -- confirm
    */
    server_config = PINT_get_server_config_struct(sm_p->object_ref.fs_id);
    ret = job_flow(
        &cur_ctx->flow_desc, sm_p, status_user_tag,
        &cur_ctx->flow_status, &cur_ctx->flow_job_id,
        pint_client_sm_context,
        server_config->client_job_flow_timeout);
    PINT_put_server_config_struct(server_config);

    if (ret < 0)
    {
        gossip_err("job_flow failed\n");
        return ret;
    }
    else if (ret == 1)
    {
        /* immediate completion: nothing is left outstanding to count */
        gossip_debug(GOSSIP_IO_DEBUG, "  flow for context %p "
                     "completed immediately\n", cur_ctx);
        assert(cur_ctx->flow_status.error_code == 0);
    }
    else
    {
        gossip_debug(GOSSIP_IO_DEBUG, "  posted flow for "
                     "context %p\n", cur_ctx);

        cur_ctx->flow_has_been_posted = 1;
        cur_ctx->flow_in_progress = 1;
        sm_p->u.io.flow_completion_count++;
    }
    return ret;
}

/*
 * Inspects the recorded job statuses of a single I/O context and, when
 * nothing failed, adds the number of bytes it moved to *total_size.
 *
 * The statuses are examined in this order, and the first non-zero error
 * code found is returned: msgpair send, msgpair recv, flow, and (writes
 * only) the final write ack recv.  A flow error also resets the flow
 * descriptor.
 *
 * For reads the byte count comes from the flow descriptor.  For writes
 * whose final ack was posted, the ack is decoded to obtain the count;
 * afterwards the flow descriptor is reset and the ack buffer is freed
 * whether or not the decode succeeded.
 *
 * Returns 0 on success, otherwise the error code found (or the decode
 * failure code).
 */
static inline int check_context_status(
    PINT_client_io_ctx *cur_ctx,
    int io_type,
    PVFS_size *total_size)
{
    struct PINT_decoded_msg ack_decoded;
    struct PVFS_server_resp *ack_resp = NULL;
    int status = 0;

    gossip_debug(GOSSIP_IO_DEBUG, "- check_context_status called\n");

    assert(cur_ctx && cur_ctx->msg && total_size);

    /* failures common to reads and writes, earliest phase first */
    if (cur_ctx->msg->send_status.error_code)
    {
        gossip_debug(GOSSIP_IO_DEBUG,
                     "  error (%d) in msgpair send for context %p\n",
                     cur_ctx->msg->send_status.error_code, cur_ctx);
        return cur_ctx->msg->send_status.error_code;
    }

    if (cur_ctx->msg->recv_status.error_code)
    {
        gossip_debug(GOSSIP_IO_DEBUG,
                     "  error (%d) in msgpair recv for context %p\n",
                     cur_ctx->msg->recv_status.error_code, cur_ctx);
        return cur_ctx->msg->recv_status.error_code;
    }

    if (cur_ctx->flow_status.error_code)
    {
        gossip_debug(GOSSIP_IO_DEBUG,
                     "  error (%d) in flow for context %p\n",
                     cur_ctx->flow_status.error_code, cur_ctx);
        PINT_flow_reset(&cur_ctx->flow_desc);
        return cur_ctx->flow_status.error_code;
    }

    if (io_type == PVFS_IO_READ)
    {
        gossip_debug(
            GOSSIP_IO_DEBUG, "  %Ld bytes read from context %p\n",
            Ld(cur_ctx->flow_desc.total_transfered), cur_ctx);

        /* the flow itself reports how much was read */
        *total_size += cur_ctx->flow_desc.total_transfered;

        /*
          the flow is deliberately left un-reset: a zero fill
          adjustment that has not been detected yet may still need it
        */
        return 0;
    }

    if (io_type != PVFS_IO_WRITE)
    {
        return 0;
    }

    if (cur_ctx->write_ack.recv_status.error_code)
    {
        gossip_debug(
            GOSSIP_IO_DEBUG,
            "  error (%d) in final ack for context %p\n",
            cur_ctx->write_ack.recv_status.error_code, cur_ctx);

        assert(cur_ctx->write_ack_has_been_posted);
        return cur_ctx->write_ack.recv_status.error_code;
    }

    if (!cur_ctx->write_ack_has_been_posted)
    {
        return 0;
    }

    /*
      the written byte count travels in the final ack, which has to
      be decoded before it can be read
    */
    status = PINT_serv_decode_resp(
        cur_ctx->msg->fs_id, cur_ctx->write_ack.encoded_resp_p,
        &ack_decoded, &cur_ctx->msg->svr_addr,
        cur_ctx->write_ack.recv_status.actual_size, &ack_resp);
    if (status != 0)
    {
        PVFS_perror_gossip("PINT_serv_decode_resp failed", status);
    }
    else
    {
        gossip_debug(
            GOSSIP_IO_DEBUG,
            "  %Ld bytes written to context %p\n",
            Ld(ack_resp->u.write_completion.total_completed),
            cur_ctx);

        *total_size += ack_resp->u.write_completion.total_completed;

        PINT_decode_release(&ack_decoded, PINT_DECODE_RESP);
    }

    /* done with this write: reset the flow and drop the ack buffer */
    PINT_flow_reset(&cur_ctx->flow_desc);
    BMI_memfree(cur_ctx->msg->svr_addr,
                cur_ctx->write_ack.encoded_resp_p,
                cur_ctx->write_ack.max_resp_sz, BMI_RECV);

    return status;
}

/*
  works out which of the input datafiles may hold data touched by
  this request.

  each datafile index in [0, input_handle_count) is probed by running
  the request processor in PINT_CKSIZE mode against it; every index
  for which a non-zero byte count is reported is appended to
  handle_index_array, and *handle_index_out_count is advanced to
  match.

  returns 0 on success, -PVFS_EINVAL if either output pointer is
  NULL, -PVFS_ENOMEM if a request state cannot be allocated, or the
  negative value returned by PINT_process_request() on failure (in
  which case the outputs hold whatever was recorded so far)
*/
static int io_find_target_datafiles(
    PVFS_Request mem_req,
    PVFS_Request file_req,
    PVFS_offset file_req_offset,
    PINT_dist *dist_p,
    PVFS_handle *input_handle_array,
    int input_handle_count,
    int *handle_index_array,
    int *handle_index_out_count)
{
    struct PINT_Request_state *file_state = NULL;
    struct PINT_Request_state *mem_state = NULL;
    PINT_request_file_data probe_fdata;
    PINT_Request_result probe_result;
    int rc = 0;
    int dfile = 0;

    gossip_debug(GOSSIP_IO_DEBUG, "- io_find_target_datafiles called\n");

    if (!handle_index_array || !handle_index_out_count)
    {
        return -PVFS_EINVAL;
    }
    *handle_index_out_count = 0;

    file_state = PINT_new_request_state(file_req);
    if (!file_state)
    {
        return -PVFS_ENOMEM;
    }

    mem_state = PINT_new_request_state(mem_req);
    if (!mem_state)
    {
        PINT_free_request_state(file_state);
        return -PVFS_ENOMEM;
    }

    /* these fields are the same for every datafile probed below */
    probe_fdata.dist = dist_p;
    probe_fdata.server_ct = input_handle_count;
    probe_fdata.extend_flag = 1;

    for (dfile = 0; dfile < input_handle_count; dfile++)
    {
        /* NOTE: the file size does not need to be accurate here; the
         * extend flag set above tells the I/O req processor to keep
         * going past eof if needed
         */
        probe_fdata.fsize = 0;
        probe_fdata.server_nr = dfile;

        PINT_REQUEST_STATE_RESET(file_state);
        PINT_REQUEST_STATE_RESET(mem_state);

        /* when a file datatype offset was given, skip ahead to it
         * before calculating
         */
        if (file_req_offset)
        {
            PINT_REQUEST_STATE_SET_TARGET(file_state, file_req_offset);
        }

        PINT_REQUEST_STATE_SET_FINAL(file_state,
            file_req_offset+PINT_REQUEST_TOTAL_BYTES(mem_req));

        memset(&probe_result, 0, sizeof(probe_result));
        probe_result.bytemax = 1;
        probe_result.segmax = 1;

        /* PINT_process_request() returns number of bytes processed */
        rc = PINT_process_request(
            file_state, mem_state, &probe_fdata,
            &probe_result, PINT_CKSIZE);
        if (rc < 0)
        {
            PINT_free_request_state(mem_state);
            PINT_free_request_state(file_state);
            return rc;
        }

        /* no bytes reported: nothing for this handle, try the next */
        if (probe_result.bytes == 0)
        {
            continue;
        }

        assert(probe_result.bytes > 0);

        handle_index_array[*handle_index_out_count] = dfile;
        (*handle_index_out_count)++;

        gossip_debug(GOSSIP_IO_DEBUG, "%s: "
                     "datafile[%d] might have data (out=%d)\n",
                     __func__, dfile, *handle_index_out_count);
    }

    PINT_free_request_state(file_state);
    PINT_free_request_state(mem_state);

    return 0;
}

/*
 * Local variables:
 *  mode: c
 *  c-indent-level: 4
 *  c-basic-offset: 4
 * End:
 *
 * vim: ft=c ts=8 sts=4 sw=4 expandtab
 */
