@article {2757, title = {The Influence of Computerized Adaptive Testing on Psychometric Theory and Practice}, journal = {Journal of Computerized Adaptive Testing}, volume = {11}, year = {2024}, abstract = {

The major premise of this article is that part of the stimulus for the evolution of psychometric theory since the 1950s was the introduction of the concept of computerized adaptive testing (CAT) or its earlier non-CAT variations. The conceptual underpinning of CAT that had the most influence on psychometric theory was the shift of emphasis from the test (or test score) as the focus of analysis to the test item (or item score). This change in focus allowed a change in the way that test results are conceived of as measurements. It also resolved conflicts among a number of ideas that were present in the early work on psychometric theory. Some of those conflicting ideas are summarized below to show how work on the development of CAT resolved them.

}, keywords = {computerized adaptive testing, Item Response Theory, paradigm shift, scaling theory, test design}, issn = {2165-6592}, doi = {10.7333/2403-1101001}, url = {https://jcatpub.net/index.php/jcat/issue/view/34/9}, author = {Reckase, Mark D.} } @article {2752, title = {An Extended Taxonomy of Variants of Computerized Adaptive Testing}, journal = {Journal of Computerized Adaptive Testing}, volume = {10}, year = {2023}, keywords = {Adaptive Testing, evidence-centered design, Item Response Theory, knowledge-based model construction, missingness}, issn = {2165-6592}, doi = {10.7333/2302-100101}, author = {Roy Levy and John T. Behrens and Robert J. Mislevy} } @article {2751, title = {The (non)Impact of Misfitting Items in Computerized Adaptive Testing}, journal = {Journal of Computerized Adaptive Testing}, volume = {9}, year = {2022}, keywords = {computerized adaptive testing, item fit, three-parameter logistic model}, doi = {10.7333/2211-0902008}, url = {https://jcatpub.net/index.php/jcat/issue/view/26}, author = {Christine E. DeMars} } @article {2717, title = {Time-Efficient Adaptive Measurement of Change}, journal = {Journal of Computerized Adaptive Testing}, volume = {7}, year = {2019}, pages = {15-34}, abstract = {

The adaptive measurement of change (AMC) refers to the use of computerized adaptive testing (CAT) at multiple occasions to efficiently assess a respondent's improvement, decline, or sameness from occasion to occasion. Whereas previous AMC research focused on administering the most informative item to a respondent at each stage of testing, the current research proposes the use of Fisher information per time unit as an item selection procedure for AMC. The latter procedure incorporates not only the amount of information provided by a given item but also the expected amount of time required to complete it. In a simulation study, the use of Fisher information per time unit item selection resulted in a lower false positive rate in the majority of conditions studied, and a higher true positive rate in all conditions studied, compared to item selection via Fisher information without accounting for the expected time taken. Future directions of research are suggested.
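As a rough illustration of the selection rule described above (not the authors' implementation), the following Python sketch picks the unadministered 2PL item with the largest Fisher information per expected second, assuming a lognormal response-time model; the toy item bank and all parameter names are hypothetical.

```python
# Minimal sketch: Fisher information per time unit item selection.
# Assumes a 2PL response model and a lognormal response-time model.
import numpy as np

def fisher_info_2pl(theta, a, b):
    """Fisher information of a 2PL item at ability theta."""
    p = 1.0 / (1.0 + np.exp(-a * (theta - b)))
    return a**2 * p * (1.0 - p)

def expected_time(beta, alpha, tau):
    """Expected response time: log T ~ N(beta - tau, 1/alpha^2)."""
    return np.exp(beta - tau + 0.5 / alpha**2)

def select_item(theta_hat, tau_hat, bank, administered):
    """Pick the unadministered item maximizing information per expected second."""
    best, best_ratio = None, -np.inf
    for j, item in enumerate(bank):
        if j in administered:
            continue
        info = fisher_info_2pl(theta_hat, item["a"], item["b"])
        t = expected_time(item["beta"], item["alpha"], tau_hat)
        if info / t > best_ratio:
            best, best_ratio = j, info / t
    return best

# Toy usage: a 5-item bank, current estimates theta = 0.3, speed tau = 0.1.
rng = np.random.default_rng(0)
bank = [{"a": rng.uniform(0.8, 2.0), "b": rng.normal(),
         "beta": rng.normal(4.0, 0.3), "alpha": rng.uniform(1.5, 2.5)}
        for _ in range(5)]
print(select_item(0.3, 0.1, bank, administered=set()))
```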

}, keywords = {adaptive measurement of change, computerized adaptive testing, Fisher information, item selection, response-time modeling}, issn = {2165-6592}, doi = {10.7333/1909-0702015}, url = {http://iacat.org/jcat/index.php/jcat/article/view/73/35}, author = {Matthew Finkelman and Chun Wang} } @conference {2651, title = {Adaptive Item and Feedback Selection in Personalized Learning with a Network Approach}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Personalized learning is a term used to describe educational systems that adapt curriculum sequencing, pacing, and presentation to each student's unique background, knowledge, preferences, interests, and learning goals (Chen, 2008; Netcoh, 2016). The technological approach to personalized learning provides data-driven models to incorporate these adaptations automatically. Examples of applications include online learning systems, educational games, and revision-aid systems. In this study we introduce Bayesian networks as a methodology to implement an adaptive framework within a personalized learning environment. Existing ideas from Computerized Adaptive Testing (CAT) with Item Response Theory (IRT), where choices about content provision are based on maximizing information, are related to the goals of personalized learning environments. Personalized learning entails goals beyond efficient ability estimation by maximizing information, such as an adaptive configuration of preferences and feedback to the student. These considerations will be discussed and their application in networks will be illustrated.

Adaptivity in Personalized Learning. In standard CATs the focus is on selecting items that provide maximum information about the ability of an individual at a certain point in time (Van der Linden \& Glas, 2000). When learning is the main goal of testing, alternative adaptive item selection methods apply, as explored by Eggen (2012). The adaptive choices made in personalized learning applications require additional adaptivity with respect to the following aspects: the moment of feedback, the kind of feedback, and the possibility for students to actively influence the learning process.

Bayesian Networks and Personalized Learning. Personalized learning aims at constructing a framework that incorporates all the aspects mentioned above. The goal of this framework is therefore not only to obtain ability estimates by choosing items on maximum information, but also to allow these other factors to play a role. Plajner and Vomlel (2016) have already applied Bayesian networks to adaptive testing, selecting items with the help of entropy reduction. Almond et al. (2015) provide a reference work on Bayesian networks in educational assessment. Both acknowledge the potential of the method in terms of features such as modularity options to build finer-grained models. IRT does not easily allow modeling sub-skills or gathering information at a fine-grained level, because it depends on the assumption of, generally, one underlying trait. The local independence assumption in IRT implies an interest mainly in the student's overall ability on the subject of interest. When the goal is to improve students' learning, we are not just interested in efficiently arriving at their test score on a global subject. One wants a model that is able to map educational problems and talents in detail over the whole educational program, while allowing for dependency between items. Some topics may be better mastered at one moment in time than others, and this is exactly what we want to get out of a model. The possibility to model flexible structures, to estimate abilities at a very detailed level for sub-skills, and to easily incorporate other variables such as feedback makes Bayesian networks a very promising method for making adaptive choices in personalized learning. This research shows how item and feedback selection can be performed with the help of Bayesian networks. A student involvement possibility is also introduced and evaluated.
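As a rough illustration of the entropy-reduction idea referenced above (Plajner and Vomlel, 2016), the following Python sketch, not taken from the study, enumerates a two-skill network with a prerequisite relation and scores two hypothetical items by their expected reduction in posterior entropy; all probabilities and item parameters are illustrative assumptions.

```python
# Minimal sketch: entropy-reduction item selection over a tiny Bayesian network
# with skills S1 -> S2 (S1 is a prerequisite for S2). All numbers are toy values.
import itertools
import numpy as np

def joint_prior():
    """P(S1, S2): P(S1=1)=0.6; P(S2=1|S1=1)=0.7, P(S2=1|S1=0)=0.1."""
    p = {}
    for s1, s2 in itertools.product([0, 1], repeat=2):
        p1 = 0.6 if s1 else 0.4
        p2 = (0.7 if s2 else 0.3) if s1 else (0.1 if s2 else 0.9)
        p[(s1, s2)] = p1 * p2
    return p

# Each item probes one skill with slip/guess parameters.
items = [{"skill": 0, "slip": 0.10, "guess": 0.20},
         {"skill": 1, "slip": 0.15, "guess": 0.25}]

def p_correct(item, skills):
    return 1 - item["slip"] if skills[item["skill"]] else item["guess"]

def posterior(prior, item, response):
    post = {s: p * (p_correct(item, s) if response else 1 - p_correct(item, s))
            for s, p in prior.items()}
    z = sum(post.values())
    return {s: v / z for s, v in post.items()}

def entropy(dist):
    probs = np.array(list(dist.values()))
    probs = probs[probs > 0]
    return -np.sum(probs * np.log2(probs))

def expected_entropy_after(prior, item):
    """Average posterior entropy over the two possible responses."""
    p_x1 = sum(p * p_correct(item, s) for s, p in prior.items())
    return (p_x1 * entropy(posterior(prior, item, 1))
            + (1 - p_x1) * entropy(posterior(prior, item, 0)))

prior = joint_prior()
gains = [entropy(prior) - expected_entropy_after(prior, it) for it in items]
print("expected entropy reduction per item:", np.round(gains, 4))
```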

References

Almond, R. G., Mislevy, R. J., Steinberg, L. S., Yan, D., \& Williamson, D. M. (2015). Bayesian Networks in Educational Assessment. New York: Springer Science+Business Media. http://doi.org/10.1007/978-0-387-98138-3

Eggen, T. J. H. M. (2012). Computerized adaptive testing item selection in computerized adaptive learning systems. In T. J. H. M. Eggen \& B. P. Veldkamp (Eds.), Psychometrics in Practice at RCEC. Enschede: RCEC.

Netcoh, S. (2016, March). What do you mean by 'personalized learning'? Crosscutting Conversations in Education – Research, Reflections \& Practice. Blog post.

Plajner, M., \& Vomlel, J. (2016). Student Skill Models in Adaptive Testing. In Proceedings of the Eighth International Conference on Probabilistic Graphical Models (pp. 403-414).

Van der Linden, W. J., \& Glas, C. A. (2000). Computerized adaptive testing: Theory and practice. Dordrecht: Kluwer Academic Publishers.

Session Video

}, keywords = {feedback selection, item selection, network approach, personalized learning}, author = {Nikky van Buuren and Hendrik Straat and Theo Eggen and Jean-Paul Fox} } @conference {2658, title = {Analysis of CAT Precision Depending on Parameters of the Item Pool}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The purpose of this research project is to analyze the measurement precision of a latent variable depending on parameters of the item pool. The influence of the following factors is analyzed:

Factor A – range of variation of items in the pool. This factor varies on three levels with the following ranges in logits: a1 – [-3.0; +3.0], a2 – [-4.0; +4.0], a3 – [-5.0; +5.0].

Factor B – number of items in the pool. The factor varies on six levels with the following numbers of items: b1 – 128, b2 – 256, b3 – 512, b4 – 1024, b5 – 2048, b6 – 4096. The items are evenly distributed in each of the variation ranges.

Factor C – examinees' proficiency, which varies at 30 levels (c1, c2, …, c30) evenly distributed in the range [-3.0; +3.0] logits.

The investigation was based on a simulation experiment within the framework of the theory of latent variables.

Response Y is the precision of measurement of examinees' proficiency, calculated as the difference between the true levels of examinees' proficiency and the estimates obtained by means of adaptive testing. A three-factor ANOVA was used for data processing.
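The following Python sketch, offered only as an illustration and not the authors' simulation, shows how such a design can be generated and analyzed: a crude Rasch-based CAT is simulated over a grid of pool ranges (factor A), pool sizes (factor B), and true proficiency levels (factor C), and the signed estimation errors are submitted to a three-factor ANOVA. The EAP scoring, nearest-difficulty item selection, reduced level grids, and replication counts are all assumptions made to keep the example small.

```python
# Illustrative simulation of CAT precision by pool range (A), pool size (B),
# and true proficiency (C), followed by a three-factor ANOVA.
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

rng = np.random.default_rng(1)

def rasch_cat_error(theta_true, pool, n_items=10):
    """Run a short maximum-information Rasch CAT; return theta_hat - theta_true."""
    theta, admin, resp = 0.0, [], []
    grid = np.linspace(-5, 5, 121)
    prior = np.exp(-grid**2 / 8)                       # ~N(0, 2) prior for EAP
    for _ in range(n_items):
        free = np.setdiff1d(np.arange(len(pool)), admin)
        j = free[np.argmin(np.abs(pool[free] - theta))]  # Rasch max info = nearest b
        admin.append(j)
        resp.append(rng.random() < 1 / (1 + np.exp(-(theta_true - pool[j]))))
        like = np.ones_like(grid)
        for jj, x in zip(admin, resp):
            pj = 1 / (1 + np.exp(-(grid - pool[jj])))
            like *= pj if x else (1 - pj)
        post = like * prior
        theta = np.sum(grid * post) / np.sum(post)     # EAP update
    return theta - theta_true

rows = []
for range_lvl, lim in zip(["a1", "a2", "a3"], [3.0, 4.0, 5.0]):
    for size_lvl, n_pool in zip(["b1", "b3", "b5"], [128, 512, 2048]):  # subset of levels for speed
        pool = np.linspace(-lim, lim, n_pool)
        for k, theta_true in enumerate(np.linspace(-3, 3, 10), start=1):  # coarser proficiency grid
            for _ in range(5):                          # replications per cell
                rows.append({"rangeA": range_lvl, "sizeB": size_lvl, "profC": f"c{k}",
                             "err": rasch_cat_error(theta_true, pool)})

df = pd.DataFrame(rows)
model = smf.ols("err ~ C(rangeA) * C(sizeB) * C(profC)", data=df).fit()
print(sm.stats.anova_lm(model, typ=2))
```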

The following results were obtained:

1. Factor A is significant. Ceteris paribus, the greater the range of variation of items in the pool, the higher the estimation precision is.

2. Factor B is significant. Ceteris paribus, the greater the number of items in the pool, the higher the estimation precision is.

3. Factor C is statistically insignificant at level α = .05. This means that the precision of estimation of examinees' proficiency is the same across the range of proficiency studied.

4. The only significant interaction among all interactions is AB. The significance of this interaction is explained by the fact that increasing the number of items in the pool decreases the effect of the range of variation of items in the pool.

Session Video

}, keywords = {CAT, Item parameters, Precision}, url = {https://drive.google.com/file/d/1Bwe58kOQRgCSbB8x6OdZTDK4OIm3LQI3/view?usp=drive_web}, author = {Anatoly Maslak and Stanislav Pozdniakov} } @conference {2657, title = {Developing a CAT: An Integrated Perspective}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Most resources on computerized adaptive testing (CAT) tend to focus on psychometric aspects such as mathematical formulae for item selection or ability estimation. However, development of a CAT assessment requires a holistic view of project management, financials, content development, product launch and branding, and more. This presentation will develop such a holistic view, which serves several purposes, including providing a framework for validity, estimating costs and ROI, and making better decisions regarding the psychometric aspects.

Thompson and Weiss (2011) presented a 5-step model for developing computerized adaptive tests (CATs). This model will be presented and discussed as the core of this holistic framework, then applied to real-life examples. While most CAT research focuses on developing new quantitative algorithms, this presentation is instead intended to help researchers evaluate and select the algorithms that are most appropriate for their needs. It is therefore ideal for practitioners who are familiar with the basics of item response theory and CAT and wish to explore how they might apply these methodologies to improve their assessments.

Steps include:

1. Feasibility, applicability, and planning studies

2. Develop item bank content or utilize existing bank

3. Pretest and calibrate item bank

4. Determine specifications for final CAT

5. Publish live CAT.

For example, Step 1 will contain simulation studies that estimate item bank requirements; these estimates can be used to determine the costs of content development, which in turn can be integrated into an estimated project cost timeline. Such information is vital in determining whether the CAT should even be developed in the first place.
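As a purely illustrative back-of-envelope sketch of how a Step 1 simulation can feed cost planning (the bound, figures, and parameter names below are assumptions, not from the presentation), one might start from an exposure-control requirement and convert the resulting bank size into authoring cost and effort.

```python
# Crude planning calculation: bank size implied by an exposure cap, then cost.
import numpy as np

def required_bank_size(test_length=30, max_exposure=0.20, overlap_factor=1.5):
    """Lower bound: enough items that no item exceeds the exposure cap,
    inflated for content constraints and items lost at calibration."""
    minimum = test_length / max_exposure           # exposure-control floor
    return int(np.ceil(minimum * overlap_factor))

bank_size = required_bank_size()
cost_per_item = 300            # assumed authoring + review cost (USD)
items_per_writer_month = 40    # assumed writing throughput
print(f"bank size ≈ {bank_size} items, "
      f"content cost ≈ ${bank_size * cost_per_item:,}, "
      f"writing effort ≈ {bank_size / items_per_writer_month:.1f} writer-months")
```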

References

Thompson, N. A., \& Weiss, D. J. (2011). A Framework for the Development of Computerized Adaptive Tests. Practical Assessment, Research \& Evaluation, 16(1). Retrieved from http://pareonline.net/getvn.asp?v=16\&n=1.

Session Video

}, keywords = {CAT Development, integrated approach}, url = {https://drive.google.com/open?id=1Jv8bpH2zkw5TqSMi03e5JJJ98QtXf-Cv}, author = {Nathan Thompson} } @conference {2656, title = {Efficiency of Item Selection in CD-CAT Based on Conjunctive Bayesian Network Modeling Hierarchical attributes}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Cognitive diagnosis models (CDMs) aim to diagnose an examinee's mastery status on multiple fine-grained skills. As new developments in cognitive diagnosis methods emerge, much attention is given to cognitive diagnostic computerized adaptive testing (CD-CAT) as well. Topics such as item selection methods, item exposure control strategies, and online calibration methods, which have been well-studied for traditional item response theory (IRT) based CAT, are also investigated in the context of CD-CAT (e.g., Xu, Chang, \& Douglas, 2003; Wang, Chang, \& Huebner, 2011; Chen et al., 2012).

In the CDM framework, some researchers suggest modeling the structural relationships between cognitive skills, or attributes. In particular, attributes can be hierarchical, such that some attributes must be acquired before subsequent ones are mastered. For example, in mathematics, addition must be mastered before multiplication, which gives a hierarchy for the addition and multiplication skills. Recently, new CDMs considering attribute hierarchies have been suggested, including the Attribute Hierarchy Method (AHM; Leighton, Gierl, \& Hunka, 2004) and the Hierarchical Diagnostic Classification Models (HDCM; Templin \& Bradshaw, 2014).

Bayesian Networks (BN), the probabilistic graphical models representing the relationship of a set of random variables using a directed acyclic graph with conditional probability distributions, also provide an efficient framework for modeling the relationship between attributes (Culbertson, 2016). Among various BNs, conjunctive Bayesian network (CBN; Beerenwinkel, Eriksson, \& Sturmfels, 2007) is a special kind of BN, which assumes partial ordering between occurrences of events and conjunctive constraints between them.

In this study, we propose using the CBN for modeling attribute hierarchies and discuss the advantages of the CBN for CDM. We then explore the impact of CBN modeling on the efficiency of item selection methods for CD-CAT when the attributes are truly hierarchical. To this end, two simulation studies, one for fixed-length CAT and another for variable-length CAT, are conducted. For each study, two attribute hierarchy structures with 5 and 8 attributes are assumed. Among the various item selection methods developed for CD-CAT, six algorithms are considered: the posterior-weighted Kullback-Leibler index (PWKL; Cheng, 2009), the modified PWKL index (MPWKL; Kaplan, de la Torre, \& Barrada, 2015), Shannon entropy (SHE; Tatsuoka, 2002), mutual information (MI; Wang, 2013), the posterior-weighted CDM discrimination index (PWCDI; Zheng \& Chang, 2016), and the posterior-weighted attribute-level CDM discrimination index (PWACDI; Zheng \& Chang, 2016). The impact of Q-matrix structure, item quality, and test termination rules on the efficiency of the item selection algorithms is also investigated. Evaluation measures include attribute classification accuracy (fixed-length experiment) and the test length of CD-CAT until stopping (variable-length experiment).
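For concreteness, the following Python sketch (an illustration under a toy DINA model, not the authors' code) computes the PWKL index for a small bank; the Q-matrix, slip and guess values, current profile estimate, and flat posterior are all assumed.

```python
# Minimal sketch: posterior-weighted Kullback-Leibler (PWKL) item selection
# under a DINA model with 3 attributes and a 4-item toy bank.
import itertools
import numpy as np

n_attr = 3
profiles = np.array(list(itertools.product([0, 1], repeat=n_attr)))

Q = np.array([[1, 0, 0],
              [1, 1, 0],
              [0, 1, 1],
              [0, 0, 1]])
slip  = np.array([0.10, 0.15, 0.12, 0.08])
guess = np.array([0.20, 0.25, 0.15, 0.22])

def p_correct(j, alpha):
    """DINA: success prob is 1 - slip if all required attributes are mastered."""
    eta = np.all(alpha >= Q[j], axis=-1)
    return np.where(eta, 1 - slip[j], guess[j])

def pwkl(j, alpha_hat, posterior):
    """Posterior-weighted KL between the response distribution at the current
    profile estimate and the distribution under every candidate profile."""
    p_hat = p_correct(j, alpha_hat)
    p_all = p_correct(j, profiles)
    kl = (p_hat * np.log(p_hat / p_all)
          + (1 - p_hat) * np.log((1 - p_hat) / (1 - p_all)))
    return np.sum(posterior * kl)

# Toy state: current estimate masters attribute 1 only; flat posterior.
alpha_hat = np.array([1, 0, 0])
posterior = np.full(len(profiles), 1 / len(profiles))
scores = [pwkl(j, alpha_hat, posterior) for j in range(len(Q))]
print("PWKL per item:", np.round(scores, 4))
```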

The results of the study indicate that the efficiency of item selection is improved by directly modeling the attribute hierarchies using the CBN. The test length until achieving the diagnosis probability threshold was reduced to 50-70\% for the CBN-based CAT compared to the CD-CAT assuming independence of attributes. The magnitude of improvement is greater when the cognitive model of the test includes more attributes and when the test length is shorter. We conclude by discussing how Q-matrix structure, item quality, and test termination rules affect the efficiency.

References

Beerenwinkel, N., Eriksson, N., \& Sturmfels, B. (2007). Conjunctive Bayesian networks. Bernoulli, 893-909.

Chen, P., Xin, T., Wang, C., \& Chang, H. H. (2012). Online calibration methods for the DINA model with independent attributes in CD-CAT. Psychometrika, 77(2), 201-222.

Cheng, Y. (2009). When cognitive diagnosis meets computerized adaptive testing: CD-CAT. Psychometrika, 74(4), 619-632.

Culbertson, M. J. (2016). Bayesian networks in educational assessment: the state of the field. Applied Psychological Measurement, 40(1), 3-21.

Kaplan, M., de la Torre, J., \& Barrada, J. R. (2015). New item selection methods for cognitive diagnosis computerized adaptive testing. Applied Psychological Measurement, 39(3), 167-188.

Leighton, J. P., Gierl, M. J., \& Hunka, S. M. (2004). The attribute hierarchy method for cognitive assessment: A variation on Tatsuoka's rule-space approach. Journal of Educational Measurement, 41(3), 205-237.

Tatsuoka, C. (2002). Data analytic methods for latent partially ordered classification models. Journal of the Royal Statistical Society: Series C (Applied Statistics), 51(3), 337-350.

Templin, J., \& Bradshaw, L. (2014). Hierarchical diagnostic classification models: A family of models for estimating and testing attribute hierarchies. Psychometrika, 79(2), 317-339.

Wang, C. (2013). Mutual information item selection method in cognitive diagnostic computerized adaptive testing with short test length. Educational and Psychological Measurement, 73(6), 1017-1035.

Wang, C., Chang, H. H., \& Huebner, A. (2011). Restrictive stochastic item selection methods in cognitive diagnostic computerized adaptive testing. Journal of Educational Measurement, 48(3), 255-273.

Xu, X., Chang, H., \& Douglas, J. (2003, April). A simulation study to compare CAT strategies for cognitive diagnosis. Paper presented at the annual meeting of National Council on Measurement in Education, Chicago.

Zheng, C., \& Chang, H. H. (2016). High-efficiency response distribution-based item selection algorithms for short-length cognitive diagnostic computerized adaptive testing. Applied Psychological Measurement, 40(8), 608-624.

Session Video

}, keywords = {CD-CAT, Conjunctive Bayesian Network Modeling, item selection}, url = {https://drive.google.com/open?id=1RbO2gd4aULqsSgRi_VZudNN_edX82NeD}, author = {Soo-Yun Han and Yun Joo Yoo} } @conference {2663, title = {From Blueprints to Systems: An Integrated Approach to Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

For years, test blueprints have told test developers how many items and what types of items will be included in a test. Adaptive testing adopted this approach from paper testing, and it is reasonably useful. Unfortunately, 'how many items and what types of items' are not the only elements one should consider when choosing items for an adaptive test. To fill the gaps, practitioners have developed tools to allow an adaptive test to behave appropriately (e.g., exposure control, content balancing, and item drift procedures). Each of these tools involves a separate process external to the primary item selection process.

The use of these subsidiary processes makes item selection less optimal and makes it difficult to prioritize aspects of selection. This discussion describes systems-based adaptive testing. This approach uses metadata concerning items, test takers, and test elements to select items. These elements are weighted by the stakeholders to shape an expanded blueprint designed for adaptive testing.

Session Video

}, keywords = {CAT, integrated approach, Keynote}, url = {https://drive.google.com/open?id=1CBaAfH4ES7XivmvrMjPeKyFCsFZOpQMJ}, author = {Gage Kingsbury and Tony Zara} } @conference {2631, title = {Generating Rationales to Support Formative Feedback in Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Computer adaptive testing offers many important benefits to support and promote life-long learning. Computers permit testing on demand, thereby allowing students to take the test at any time during instruction; items on computerized tests are scored immediately, thereby providing students with instant feedback; computerized tests permit continuous administration, thereby allowing students more choice about when they write their exams. But despite these important benefits, the advent of computer adaptive testing has also raised formidable challenges, particularly in the area of item development. Educators must have access to large numbers of diverse, high-quality test items to implement computerized adaptive testing because items are continuously administered to students. Hence, hundreds or even thousands of items are needed to develop the item banks necessary for computer adaptive testing. Unfortunately, educational test items, as they are currently created, are time-consuming and expensive to develop because each individual item is written, initially, by a content specialist and then reviewed, edited, and revised by groups of content specialists to ensure the items yield reliable and valid information. Hence, item development is one of the most important problems that must be solved before we can migrate to computer adaptive testing to support life-long learning, because large numbers of high-quality, content-specific test items are required.

One promising item development method that may be used to address this challenge is automatic item generation. Automatic item generation is a relatively new but rapidly evolving research area in which cognitive and psychometric modelling practices are used to produce hundreds of new test items with the aid of computer technology. The purpose of our presentation is to describe a new methodology for generating both the items and the rationales required to solve each generated item, in order to produce the feedback needed to support life-long learning. Our item generation methodology will first be described. To ensure our description is practical, the method will then be illustrated using generated items from the health sciences, showing how item generation can promote life-long learning for medical educators and practitioners.

Session Video

}, keywords = {Adaptive Testing, formative feedback, Item generation}, url = {https://drive.google.com/open?id=1O5KDFtQlDLvhNoDr7X4JO4arpJkIHKUP}, author = {Mark Gierl and Okan Bulut} } @conference {2647, title = {Grow a Tiger out of Your CAT }, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The main focus in the community of test developers and researchers is on improving adaptive test procedures and methodologies. Yet, the transition from research projects to larger-scale operational CATs is facing its own challenges. Usually, these operational CATs find their origin in government tenders. "Scalability", "Interoperability", and "Transparency" are three keywords often found in these documents. Scalability is concerned with parallel system architectures that are based upon stateless selection algorithms. Design capacities often range from 10,000 to well over 100,000 concurrent students. Interoperability is implemented in standards like QTI, standards that were not designed with adaptive testing in mind. Transparency is being realized by open source software: the adaptive test should not be a black box. These three requirements often complicate the development of an adaptive test, or sometimes even conflict.

Session Video

}, keywords = {interoperability, Scalability, transparency}, author = {Angela Verschoor} } @conference {2672, title = {An Imputation Approach to Handling Incomplete Computerized Tests}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

As technology advances, computerized adaptive testing (CAT) is becoming increasingly popular as it allows tests to be tailored to an examinee's ability. Nevertheless, examinees might devise testing strategies to use CAT to their advantage. For instance, if only the items that examinees answer count towards their score, then a higher theta score might be obtained by spending more time on items at the beginning of the test and skipping items at the end if time runs out. This type of gaming can be discouraged if examinees' scores are lowered or "penalized" based on the amount of non-response.

The goal of this study was to devise a penalty function that would meet two criteria: 1) the greater the omit rate, the greater the penalty, and 2) examinees with the same ability and the same omit rate should receive the same penalty. To create the penalty, theta was calculated based on only the items the examinee responded to. Next, the expected number-correct score (EXR) was obtained using this theta and the test characteristic curve. A penalized expected number-correct score was obtained by multiplying EXR by the proportion of items the examinee responded to. Finally, the penalized theta was identified from the test characteristic curve. Based on the penalized theta and the item parameters of an unanswered item, the likelihood of a correct response is computed and employed to estimate the imputed score for the unanswered item.

Two datasets were used to generate tests with completion rates of 50\%, 80\%, and 90\%. The first dataset included real data in which approximately 4,500 examinees responded to a 21-item test, which provided a baseline/truth. Sampling was done to achieve the three completion-rate conditions. The second dataset consisted of simulated item scores for 50,000 simulees under a 1-2-4 multi-stage CAT design in which each stage contained seven items. Imputed item scores for unanswered items were computed using a variety of values for G (and therefore T). Three other approaches to handling unanswered items were also considered: all correct (i.e., T = 0), all incorrect (i.e., T = 1), and random scoring (i.e., T = 0.5).

The current study investigated the impact on theta estimates resulting from the proposed approach to handling unanswered items in a fixed-length CAT. In real testing situations, when examinees do not finish a test, it is hard to tell whether they tried diligently but ran out of time or whether they attempted to manipulate the scoring engine. To handle unfinished tests with penalties, the proposed approach considers examinees' abilities and incompletion rates. The results of this study provide direction for psychometric practitioners when considering penalties for omitted responses.
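A minimal Python sketch of the penalty idea described above is given below; the 3PL item parameters, grid inversion of the test characteristic curve, and completion figures are illustrative assumptions rather than the authors' implementation.

```python
# Sketch: penalize theta for non-response via the test characteristic curve (TCC),
# then impute unanswered items with the model probability of a correct response.
import numpy as np

def p3pl(theta, a, b, c):
    return c + (1 - c) / (1 + np.exp(-a * (theta - b)))

def tcc(theta, items):
    """Expected number-correct score on the full test at ability theta."""
    return sum(p3pl(theta, *it) for it in items)

def invert_tcc(target, items, grid=np.linspace(-4, 4, 801)):
    scores = np.array([tcc(t, items) for t in grid])
    return grid[np.argmin(np.abs(scores - target))]

def penalized_theta(theta_answered, items, n_answered):
    """Shrink the expected number-correct by the completion rate, then map back."""
    exr = tcc(theta_answered, items)
    exr_pen = exr * (n_answered / len(items))
    return invert_tcc(exr_pen, items)

# Toy test of 21 items; examinee answered 15 items and obtained theta_hat = 0.8.
rng = np.random.default_rng(2)
items = [(rng.uniform(0.8, 2.0), rng.normal(), 0.2) for _ in range(21)]
theta_pen = penalized_theta(0.8, items, n_answered=15)
# Imputed score for an unanswered item = probability of a correct response at theta_pen.
imputed = p3pl(theta_pen, *items[20])
print(round(float(theta_pen), 3), round(float(imputed), 3))
```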

Session Video

}, keywords = {CAT, imputation approach, incomplete computerized test}, url = {https://drive.google.com/open?id=1vznZeO3nsZZK0k6_oyw5c9ZTP8uyGnXh}, author = {Troy Chen and Chi-Yu Huang and Chunyan Liu} } @conference {2634, title = {Issues in Trait Range Coverage for Patient Reported Outcome Measure CATs - Extending the Ceiling for Above-average Physical Functioning}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The use of a measure that fails to cover the upper range of functioning may produce results that lead to serious misinterpretation. Scores produced by such a measure may fail to recognize significant improvement, or may not be able to demonstrate functioning commensurate with an important milestone. Accurate measurement of this range is critical for the assessment of physically active adults, e.g., athletes recovering from injury and active military personnel who wish to return to active service. Alternatively, a PF measure with a low ceiling might fail to differentiate patients in rehabilitation who continue to improve but whose scores hit a ceiling due to the measure used.

The assessment of physical function (PF) has greatly benefited from modern psychometric theory and the resulting scales, such as the Patient-Reported Outcomes Measurement Information System (PROMIS®) PF instruments. While PROMIS PF has extended the range of function upward relative to older "legacy" instruments, few PROMIS PF items assess high levels of function. We report here on the development of higher-functioning items for the PROMIS PF bank.

An expert panel representing orthopedics, sports/military medicine, and rehabilitation reviewed existing instruments and wrote new items. After internal review, cognitive interviews were conducted with 24 individuals of average and high levels of physical function. The remaining candidate items were administered along with 50 existing PROMIS anchor items to an internet panel screened for low, average, and high levels of physical function (N = 1,600), as well as members of Boston-area gyms (N= 344). The resulting data was subjected to standard psychometric analysis, along with multiple linking methods to place the new items on the existing PF metric. The new items were added to the full PF bank for simulated computerized adaptive testing (CAT).

Item response data were collected on 54 candidate items. Items that exhibited local dependence (LD) or differential item functioning (DIF) related to gender, age, race, education, or PF status were removed from consideration. Of the 50 existing PROMIS PF items, 31 were free of DIF and LD and were used as anchors. The parameters for the remaining new candidate items were estimated twice: freely estimated and linked with coefficients, and fixed-anchor calibration. Both methods were comparable and had appropriate fit. The new items were added to the full PF bank for simulated CATs. The resulting CAT was able to extend the ceiling with high precision to a T-score of 68, suggesting accurate measurement for 97\% of the general population.

Extending the range of items by which PF is measured will substantially improve measurement quality, applicability, and efficiency. The bank has incorporated these extension items and is available for use in research and clinics for brief CAT administration (see www.healthmeasures.net). Future research projects should focus on recovery trajectories of the measure for individuals with above average function who are recovering from injury.

Session Video

}, keywords = {CAT, Issues, Patient Reported Outcome}, url = {https://drive.google.com/open?id=1ZC02F-dIyYovEjzpeuRdoXDiXMLFRuKb}, author = {Richard C. Gershon} } @conference {2646, title = {Item Pool Design and Evaluation}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Early work on CAT tended to use existing sets of items that came from fixed-length test forms. These sets of items were selected to meet much different requirements than are needed for a CAT, such as decision making or covering a content domain. However, there was also some early work suggesting that items be equally distributed over the range of proficiency of interest, or concentrated at a decision point. There was also some work showing that proficiency estimates are biased when an item pool is too easy or too hard. These early findings eventually led to work on item pool design and, more recently, on item pool evaluation. This presentation gives a brief overview of these topics to provide context for the following presentations in this symposium.

Session Video

}, keywords = {CAT, Item Pool Design}, url = {https://drive.google.com/open?id=1ZAsqm1yNZlliqxEHcyyqQ_vOSu20xxZs}, author = {Mark D Reckase and Wei He and Jing-Ru Xu and Xuechun Zhou} } @conference {2629, title = {Item Selection Strategies for Developing CAT in Indonesia}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niiagata Seiryo University}, organization = {Niiagata Seiryo University}, address = {Niigata Japan}, abstract = {

Recently, the development of computerized testing in Indonesia has become quite promising. Many government institutions use the technology for recruitment. Since the Indonesian Army acknowledged the benefits of computerized adaptive testing (CAT) over conventional test administration, the issue of selecting the first item has attracted attention. Given CAT's basic philosophy, several methods can be used to select the first item, such as educational level, an ability estimate from item simulation, or other methods. The question remains how to apply these methods most effectively in the context of constrained adaptive testing. This paper reviews such strategies as they have appeared in the relevant literature. The focus is on studies that have evaluated the effectiveness of first-item selection strategies for dichotomous scoring. The paper also discusses the strengths and weaknesses of each group of strategies using examples from simulation studies. No new research is presented; rather, a compendium of approaches is reviewed from the perspective of a newcomer seeking a broad view of first-item selection strategies.

}, keywords = {CAT, Indonesia, item selection strategies}, url = {https://www.youtube.com/watch?v=2KuFrRATq9Q}, author = {Istiani Chandra} } @article {2529, title = {Latent-Class-Based Item Selection for Computerized Adaptive Progress Tests}, journal = {Journal of Computerized Adaptive Testing}, volume = {5}, year = {2017}, pages = {22-43}, keywords = {computerized adaptive progress test, item selection method, Kullback-Leibler information, Latent class analysis, log-odds scoring}, issn = {2165-6592}, doi = {10.7333/1704-0502022}, url = {http://iacat.org/jcat/index.php/jcat/article/view/62/29}, author = {van Buuren, Nikky and Eggen, Theo J. H. M.} } @conference {2648, title = {New Challenges (With Solutions) and Innovative Applications of CAT}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Over the past several decades, computerized adaptive testing (CAT) has profoundly changed the administration of large-scale aptitude tests, state-wide achievement tests, professional licensure exams, and health outcome measures. While many challenges of CAT have been successfully addressed due to the continual efforts of researchers in the field, there are still many remaining, longstanding challenges that have yet to be resolved. This symposium will begin with three presentations, each of which provides a sound solution to one of the unresolved challenges. They are (1) item calibration when responses are "missing not at random" from CAT administration; (2) online calibration of new items when person traits have non-ignorable measurement error; and (3) establishing consistency and asymptotic normality of latent trait estimation when allowing item response revision in CAT. In addition, this symposium also features innovative applications of CAT. In particular, there is emerging interest in using cognitive diagnostic CAT to monitor and detect learning progress (4th presentation). Last but not least, the 5th presentation illustrates the power of multidimensional polytomous CAT that permits rapid identification of hospitalized patients' rehabilitative care needs in health outcomes measurement. We believe this symposium covers a wide range of interesting and important topics in CAT.

Session Video

}, keywords = {CAT, challenges, innovative applications}, url = {https://drive.google.com/open?id=1Wvgxw7in_QCq_F7kzID6zCZuVXWcFDPa}, author = {Chun Wang and David J. Weiss and Xue Zhang and Jian Tao and Yinhong He and Ping Chen and Shiyu Wang and Susu Zhang and Haiyan Lin and Xiaohong Gao and Hua-Hua Chang and Zhuoran Shang} } @conference {2636, title = {New Results on Bias in Estimates due to Discontinue Rules in Intelligence Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The presentation provides new results on a form of adaptive testing that is used frequently in intelligence testing. In these tests, items are presented in order of increasing difficulty, and the presentation of items is adaptive in the sense that each subtest session is discontinued once a test taker produces a certain number of incorrect responses in sequence. The subsequent (not observed) responses are commonly scored as wrong for that subtest, even though the test taker has not seen them. Discontinuation rules allow a certain form of adaptiveness in both paper-based and computer-based testing, and help reduce testing time.

Two relevant lines of research are studies that directly assess the impact of discontinuation rules, and studies that look more broadly at the impact of scoring rules on test results with a large number of not-administered or not-reached items. He \& Wolfe (2012) compared different ability estimation methods for this type of discontinuation-rule adaptation of test length in a simulation study. However, to our knowledge there has been no rigorous analytical study of the underlying distributional changes of the response variables under discontinuation rules. It is important to point out that the results obtained by He \& Wolfe (2012) agree with results presented by, for example, DeAyala, Plake, \& Impara (2001) as well as Rose, von Davier, \& Xu (2010) and Rose, von Davier, \& Nagengast (2016), in that ability estimates are biased most when scoring the not-observed responses as wrong. Discontinuation rules combined with scoring the non-administered items as wrong are used operationally in several major intelligence tests, so more research is needed in order to improve this particular type of adaptiveness in testing practice.

The presentation extends existing research on adaptiveness by discontinue-rules in intelligence tests in multiple ways: First, a rigorous analytical study of the distributional properties of discontinue-rule scored items is presented. Second, an extended simulation is presented that includes additional alternative scoring rules as well as bias-corrected ability estimators that may be suitable to improve results for discontinue-rule scored intelligence tests.
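The following Python sketch (illustrative assumptions only, not the simulation reported here) shows the kind of comparison at issue: under a Rasch model with a three-errors-in-a-row discontinue rule, maximum-likelihood ability estimates are computed once with not-reached items treated as missing and once with them scored as wrong, and the resulting bias is compared.

```python
# Sketch: bias in Rasch ability estimates under a discontinue rule, comparing
# "not reached = missing" with "not reached = scored wrong".
import numpy as np

rng = np.random.default_rng(3)
b = np.sort(rng.uniform(-2.5, 2.5, 40))          # items ordered by difficulty

def mle_theta(resp, b_admin, grid=np.linspace(-4, 4, 401)):
    """Grid-search maximum-likelihood estimate of theta under the Rasch model."""
    p = 1 / (1 + np.exp(-(grid[:, None] - b_admin[None, :])))
    loglik = np.sum(np.where(resp, np.log(p), np.log(1 - p)), axis=1)
    return grid[np.argmax(loglik)]

def simulate(theta_true):
    resp, errors_in_row = [], 0
    for j in range(len(b)):
        x = rng.random() < 1 / (1 + np.exp(-(theta_true - b[j])))
        resp.append(x)
        errors_in_row = 0 if x else errors_in_row + 1
        if errors_in_row == 3:                    # discontinue rule
            break
    n = len(resp)
    theta_missing = mle_theta(np.array(resp), b[:n])
    scored_wrong = np.concatenate([resp, np.zeros(len(b) - n, dtype=bool)])
    theta_wrong = mle_theta(scored_wrong, b)
    return theta_missing - theta_true, theta_wrong - theta_true

bias = np.array([simulate(0.0) for _ in range(500)])
print(f"mean bias (not-reached treated as missing): {bias[:, 0].mean():.3f}")
print(f"mean bias (not-reached scored as wrong):    {bias[:, 1].mean():.3f}")
```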

References: DeAyala, R. J., Plake, B. S., \& Impara, J. C. (2001). The impact of omitted responses on the accuracy of ability estimation in item response theory. Journal of Educational Measurement, 38, 213-234.

He, W., \& Wolfe, E. W. (2012). Treatment of not-administered items on individually administered intelligence tests. Educational and Psychological Measurement, 72(5), 808-826. DOI: 10.1177/0013164412441937

Rose, N., von Davier, M., \& Xu, X. (2010). Modeling non-ignorable missing data with item response theory (IRT; ETS RR-10-11). Princeton, NJ: Educational Testing Service.

Rose, N., von Davier, M., \& Nagengast, B. (2016). Modeling omitted and not-reached items in IRT models. Psychometrika. doi:10.1007/s11336-016-9544-7

Session Video

}, keywords = {Bias, CAT, Intelligence Testing}, author = {Matthias von Davier and Youngmi Cho and Tianshu Pan} } @conference {2661, title = {Using Computerized Adaptive Testing to Detect Students{\textquoteright} Misconceptions: Exploration of Item Selection}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Holding misconceptions impedes learning; thus, detecting misconceptions through assessments is crucial to facilitate teaching. However, most computerized adaptive testing (CAT) applications for diagnosing examinees' attribute profiles focus on whether examinees have mastered the correct concepts or not. In educational scenarios, teachers and students have to figure out the misconceptions underlying incorrect answers after obtaining scores from assessments and then correct the corresponding misconceptions. The Scaling Individuals and Classifying Misconceptions (SICM) models proposed by Bradshaw and Templin (2014) fill this gap. SICM models can identify a student's misconceptions directly from the distractors of multiple-choice questions and report whether he or she holds each misconception or not. Simultaneously, SICM models are able to estimate a continuous ability within the item response theory (IRT) framework to fulfill the needs of policy-driven assessment systems that rely on scaling examinees' ability. However, the advantage of providing estimates for two types of latent variables also makes model estimation more complex. More items are required to achieve the same accuracy for classification and for estimation compared to dichotomous DCMs and to IRT, respectively. Thus, we aim to develop a CAT using the SICM models (SICM-CAT) to estimate students' misconceptions and continuous abilities simultaneously using fewer items than a linear test.

To achieve this goal, our research questions in this study mainly focus on establishing several item selection rules that aim to provide both accurate classification results and accurate continuous ability estimates in the SICM-CAT. The first research question is which information criterion to use. The Kullback-Leibler (KL) divergence is the first choice, as it can naturally combine continuous and discrete latent variables. Based on this criterion, we propose an item selection index that integrates the two types of information. Using this index, the items selected in real time can discriminate the examinee's current misconception-profile and ability estimates from other possible estimates to the greatest extent. The second research question is how to adaptively balance the estimation of the misconception profile and the continuous latent ability. Mimicking the idea of the Hybrid Design proposed by Wang et al. (2016), we propose a design framework in which item selection transitions from the group level to the item level. We aim to explore several design questions, such as how to select the transition point and which latent variable estimation should be targeted first.
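As a rough illustration of such a joint index (under an assumed toy option-response model, not the SICM parameterization), the following Python sketch weights the KL divergence between option-response distributions at the current estimate and at candidate latent states by a joint posterior over misconception class and an ability grid; all functions and values are hypothetical.

```python
# Sketch: posterior-weighted KL index over a joint (misconception class, theta) grid
# for multiple-choice items with a key, a misconception-linked distractor, and an
# "other" option. The option-response model below is a toy assumption.
import numpy as np

theta_grid = np.linspace(-3, 3, 31)
classes = [0, 1]                      # 0 = no misconception, 1 = misconception held

def option_probs(item, c, theta):
    """Toy 3-option model: key propensity rises with theta; the misconception
    class shifts mass to the distractor linked to that misconception."""
    z_key = item["a"] * (theta - item["b"])
    z_misc = 1.0 if c == 1 else -1.0
    z = np.array([z_key, z_misc, 0.0])           # key, misconception distractor, other
    ez = np.exp(z - z.max())
    return ez / ez.sum()

def joint_kl_index(item, c_hat, theta_hat, joint_post):
    """Posterior-weighted KL between p(option | current estimate) and
    p(option | candidate class, candidate theta)."""
    p_hat = option_probs(item, c_hat, theta_hat)
    total = 0.0
    for ci, c in enumerate(classes):
        for ti, t in enumerate(theta_grid):
            p_alt = option_probs(item, c, t)
            total += joint_post[ci, ti] * np.sum(p_hat * np.log(p_hat / p_alt))
    return total

# Toy state: flat joint posterior; current estimates c_hat = 0, theta_hat = 0.2.
items = [{"a": 1.2, "b": -0.5}, {"a": 1.6, "b": 0.4}]
joint_post = np.full((len(classes), len(theta_grid)),
                     1 / (len(classes) * len(theta_grid)))
scores = [joint_kl_index(it, 0, 0.2, joint_post) for it in items]
print("joint KL index per item:", np.round(scores, 4))
```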

Preliminary results indicated that the SICM-CAT based on the proposed item selection index could classify examinees into different latent classes and measure their latent abilities more accurately and reliably than the random selection method under all the simulation conditions. We plan to compare different CAT designs based on our proposed item selection rules with the best linear test as a next step. We expect that the SICM-CAT will be able to use a shorter test length while retaining the same accuracy and reliability.

References

Bradshaw, L., \& Templin, J. (2014). Combining item response theory and diagnostic classification models: A psychometric model for scaling ability and diagnosing misconceptions. Psychometrika, 79(3), 403-425.

Wang, S., Lin, H., Chang, H. H., \& Douglas, J. (2016). Hybrid computerized adaptive testing: from group sequential design to fully sequential design. Journal of Educational Measurement, 53(1), 45-62.

Session Video

}, keywords = {CAT, incorrect answering, Student Misconception}, author = {Yawei Shen and Yu Bao and Shiyu Wang and Laine Bradshaw} } @article {2491, title = {Effect of Imprecise Parameter Estimation on Ability Estimation in a Multistage Test in an Automatic Item Generation Context }, journal = {Journal of Computerized Adaptive Testing}, volume = {4}, year = {2016}, pages = {1-18}, keywords = {Adaptive Testing, automatic item generation, errors in item parameters, item clones, multistage testing}, issn = {2165-6592 }, doi = {10.7333/1608-040101}, url = {http://iacat.org/jcat/index.php/jcat/article/view/59/27}, author = {Colvin, Kimberly and Keller, Lisa A and Robin, Frederic} } @article {2345, title = {Detecting Item Preknowledge in Computerized Adaptive Testing Using Information Theory and Combinatorial Optimization}, journal = {Journal of Computerized Adaptive Testing}, volume = {2}, year = {2014}, pages = {37-58}, keywords = {combinatorial optimization, hypothesis testing, item preknowledge, Kullback-Leibler divergence, simulated annealing., test security}, issn = {2165-6592}, doi = {10.7333/1410-0203037}, url = {http://www.iacat.org/jcat/index.php/jcat/article/view/36/18}, author = {Belov, D. I.} } @conference {2080, title = {Adaptive Item Calibration and Norming: Unique Considerations of a Global Deployment}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, keywords = {CAT, common item equating, Figural Reasoning Test, item calibration, norming}, author = {Alexander Schwall and Evan Sinar} } @article {2070, title = {catR: An R Package for Computerized Adaptive Testing}, journal = {Applied Psychological Measurement}, year = {2011}, abstract = {

Computerized adaptive testing (CAT) is an active current research field in psychometrics and educational measurement. However, there is very little software available to handle such adaptive tasks. The R package catR was developed to perform adaptive testing with as much flexibility as possible, in an attempt to provide a developmental and testing platform to the interested user. Several item-selection rules and ability estimators are implemented. The item bank can be provided by the user or randomly generated from parent distributions of item parameters. Three stopping rules are available. The output can be graphically displayed.

}, keywords = {computer program, computerized adaptive testing, Estimation, Item Response Theory}, doi = {10.1177/0146621611407482}, author = {Magis, D. and Ra{\^\i}che, G.} } @conference {2100, title = {Continuous Testing (an avenue for CAT research)}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Publishing an Adaptive Test

Problems with Publishing

Research Questions

}, keywords = {CAT, item filter, item filtration}, author = {G. Gage Kingsbury} } @conference {2099, title = {From Reliability to Validity: Expanding Adaptive Testing Practice to Find the Most Valid Score for Each Test Taker}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

CAT is an exception to the traditional conception of validity. It is one of the few examples of individualized testing. Item difficulty is tailored to each examinee. The intent, however, is increased efficiency. Focus on reliability (reduced standard error); Equivalence with paper \& pencil tests is valued; Validity is enhanced through improved reliability.

How Else Might We Individualize Testing Using CAT?

An ISV-Based View of Validity

Test Event -- An examinee encounters a series of items in a particular context.

CAT Goal: individualize testing to address CIV threats to score validity (i.e., maximize ISV).

Some Research Issues:

}, keywords = {CAT, CIV, construct-irrelevant variance, Individual Score Validity, ISV, low test taking motivation, Reliability, validity}, author = {Steven L. Wise} } @conference {2082, title = {Impact of Item Drift on Candidate Ability Estimation}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

For large operational pools, candidate ability estimates appear robust to item drift, especially under conditions that may represent 'normal' amounts of drift. Even with 'extreme' conditions of drift (e.g., 20\% of items drifting 1.00 logits), decision consistency was still high.

}, keywords = {item drift}, author = {Sarah Hagge and Ada Woo and Phil Dickison} } @conference {2106, title = {Item Selection Methods based on Multiple Objective Approaches for Classification of Respondents into Multiple Levels}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Is it possible to develop new item selection methods which take advantage of the fact that we want to classify into multiple categories? New methods: Taking multiple points on the ability scale into account; Based on multiple objective approaches.

Conclusions

}, keywords = {adaptive classification test, CAT, item selection, sequential classification test}, author = {Maaike van Groen and Theo Eggen and Bernard Veldkamp} } @conference {2108, title = {Optimal Calibration Designs for Computerized Adaptive Testing}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Optimization

How can we exploit the advantages of Balanced Block Design while keeping the logistics manageable?

Homogeneous Designs: Overlap between test booklets as regular as possible

Conclusions:

}, keywords = {balanced block design, CAT, item calibration, optimization, Rasch}, author = {Angela Verschoor} } @conference {2081, title = {Practitioner{\textquoteright}s Approach to Identify Item Drift in CAT}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, keywords = {CUSUM method, G2 statistic, IPA, item drift, item parameter drift, Lord{\textquoteright}s chi-square statistic, Raju{\textquoteright}s NCDIF}, author = {Huijuan Meng and Susan Steinkamp and Paul Jones and Joy Matthews-Lopez} } @article {2071, title = {Item Selection and Hypothesis Testing for the Adaptive Measurement of Change}, journal = {Applied Psychological Measurement}, volume = {34}, year = {2010}, pages = {238-254}, abstract = {

Assessing individual change is an important topic in both psychological and educational measurement. An adaptive measurement of change (AMC) method had previously been shown to exhibit greater efficiency in detecting change than conventional nonadaptive methods. However, little work had been done to compare different procedures within the AMC framework. This study introduced a new item selection criterion and two new test statistics for detecting change with AMC that were specifically designed for the paradigm of hypothesis testing. In two simulation sets, the new methods for detecting significant change improved on existing procedures by demonstrating better adherence to Type I error rates and substantially better power for detecting relatively small change.

}, keywords = {change, computerized adaptive testing, individual change, Kullback{\textendash}Leibler information, likelihood ratio, measuring change}, doi = {10.1177/0146621609344844}, author = {Finkelman, M. D. and Weiss, D. J. and Kim-Kang, G.} } @article {112, title = {A mixed integer programming model for multiple stage adaptive testing}, journal = {European Journal of Operational Research}, volume = {193}, number = {2}, year = {2009}, note = {doi: DOI: 10.1016/j.ejor.2007.10.047}, pages = {342-350}, abstract = {The last decade has seen paper-and-pencil (P\&P) tests being replaced by computerized adaptive tests (CATs) within many testing programs. A CAT may yield several advantages relative to a conventional P\&P test. A CAT can determine the questions or test items to administer, allowing each test form to be tailored to a test taker{\textquoteright}s skill level. Subsequent items can be chosen to match the capability of the test taker. By adapting to a test taker{\textquoteright}s ability, a CAT can acquire more information about a test taker while administering fewer items. A Multiple Stage Adaptive test (MST) provides a means to implement a CAT that allows review before the administration. The MST format is a hybrid between the conventional P\&P and CAT formats. This paper presents mixed integer programming models for MST assembly problems. Computational results with commercial optimization software will be given and advantages of the models evaluated.}, keywords = {Education, Integer programming, Linear programming}, isbn = {0377-2217}, author = {Edmonds, J. and Armstrong, R. D.} } @article {88, title = {Assessing self-care and social function using a computer adaptive testing version of the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {89}, number = {4}, year = {2008}, note = {Coster, Wendy JHaley, Stephen MNi, PengshengDumas, Helene MFragala-Pinkham, Maria AK02 HD45354-01A1/HD/NICHD NIH HHS/United StatesR41 HD052318-01A1/HD/NICHD NIH HHS/United StatesR43 HD42388-01/HD/NICHD NIH HHS/United StatesComparative StudyResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2008 Apr;89(4):622-9.}, month = {Apr}, pages = {622-629}, edition = {2008/04/01}, abstract = {OBJECTIVE: To examine score agreement, validity, precision, and response burden of a prototype computer adaptive testing (CAT) version of the self-care and social function scales of the Pediatric Evaluation of Disability Inventory compared with the full-length version of these scales. DESIGN: Computer simulation analysis of cross-sectional and longitudinal retrospective data; cross-sectional prospective study. SETTING: Pediatric rehabilitation hospital, including inpatient acute rehabilitation, day school program, outpatient clinics; community-based day care, preschool, and children{\textquoteright}s homes. PARTICIPANTS: Children with disabilities (n=469) and 412 children with no disabilities (analytic sample); 38 children with disabilities and 35 children without disabilities (cross-validation sample). INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from prototype CAT applications of each scale using 15-, 10-, and 5-item stopping rules; scores from the full-length self-care and social function scales; time (in seconds) to complete assessments and respondent ratings of burden. 
RESULTS: Scores from both computer simulations and field administration of the prototype CATs were highly consistent with scores from full-length administration (r range, .94-.99). Using computer simulation of retrospective data, discriminant validity, and sensitivity to change of the CATs closely approximated that of the full-length scales, especially when the 15- and 10-item stopping rules were applied. In the cross-validation study the time to administer both CATs was 4 minutes, compared with over 16 minutes to complete the full-length scales. CONCLUSIONS: Self-care and social function score estimates from CAT administration are highly comparable with those obtained from full-length scale administration, with small losses in validity and precision and substantial decreases in administration time.}, keywords = {*Disability Evaluation, *Social Adjustment, Activities of Daily Living, Adolescent, Age Factors, Child, Child, Preschool, Computer Simulation, Cross-Over Studies, Disabled Children/*rehabilitation, Female, Follow-Up Studies, Humans, Infant, Male, Outcome Assessment (Health Care), Reference Values, Reproducibility of Results, Retrospective Studies, Risk Factors, Self Care/*standards/trends, Sex Factors, Sickness Impact Profile}, isbn = {1532-821X (Electronic)0003-9993 (Linking)}, author = {Coster, W. J. and Haley, S. M. and Ni, P. and Dumas, H. M. and Fragala-Pinkham, M. A.} } @article {241, title = {Binary items and beyond: a simulation of computer adaptive testing using the Rasch partial credit model}, journal = {Journal of Applied Measurement}, volume = {9}, number = {1}, year = {2008}, note = {Lange, RenseUnited StatesJournal of applied measurementJ Appl Meas. 2008;9(1):81-104.}, pages = {81-104}, edition = {2008/01/09}, abstract = {Past research on Computer Adaptive Testing (CAT) has focused almost exclusively on the use of binary items and minimizing the number of items to be administrated. To address this situation, extensive computer simulations were performed using partial credit items with two, three, four, and five response categories. Other variables manipulated include the number of available items, the number of respondents used to calibrate the items, and various manipulations of respondents{\textquoteright} true locations. Three item selection strategies were used, and the theoretically optimal Maximum Information method was compared to random item selection and Bayesian Maximum Falsification approaches. The Rasch partial credit model proved to be quite robust to various imperfections, and systematic distortions did occur mainly in the absence of sufficient numbers of items located near the trait or performance levels of interest. The findings further indicate that having small numbers of items is more problematic in practice than having small numbers of respondents to calibrate these items. Most importantly, increasing the number of response categories consistently improved CAT{\textquoteright}s efficiency as well as the general quality of the results. In fact, increasing the number of response categories proved to have a greater positive impact than did the choice of item selection method, as the Maximum Information approach performed only slightly better than the Maximum Falsification approach. Accordingly, issues related to the efficiency of item selection methods are far less important than is commonly suggested in the literature. 
However, being based on computer simulations only, the preceding presumes that actual respondents behave according to the Rasch model. CAT research could thus benefit from empirical studies aimed at determining whether, and if so, how, selection strategies impact performance.}, keywords = {*Data Interpretation, Statistical, *User-Computer Interface, Educational Measurement/*statistics \& numerical data, Humans, Illinois, Models, Statistical}, isbn = {1529-7713 (Print)1529-7713 (Linking)}, author = {Lange, R.} } @article {231, title = {Computerized adaptive testing in back pain: Validation of the CAT-5D-QOL}, journal = {Spine}, volume = {33}, number = {12}, year = {2008}, note = {Kopec, Jacek ABadii, MaziarMcKenna, MarioLima, Viviane DSayre, Eric CDvorak, MarcelResearch Support, Non-U.S. Gov{\textquoteright}tValidation StudiesUnited StatesSpineSpine (Phila Pa 1976). 2008 May 20;33(12):1384-90.}, month = {May 20}, pages = {1384-90}, edition = {2008/05/23}, abstract = {STUDY DESIGN: We have conducted an outcome instrument validation study. OBJECTIVE: Our objective was to develop a computerized adaptive test (CAT) to measure 5 domains of health-related quality of life (HRQL) and assess its feasibility, reliability, validity, and efficiency. SUMMARY OF BACKGROUND DATA: Kopec and colleagues have recently developed item response theory based item banks for 5 domains of HRQL relevant to back pain and suitable for CAT applications. The domains are Daily Activities (DAILY), Walking (WALK), Handling Objects (HAND), Pain or Discomfort (PAIN), and Feelings (FEEL). METHODS: An adaptive algorithm was implemented in a web-based questionnaire administration system. The questionnaire included CAT-5D-QOL (5 scales), Modified Oswestry Disability Index (MODI), Roland-Morris Disability Questionnaire (RMDQ), SF-36 Health Survey, and standard clinical and demographic information. Participants were outpatients treated for mechanical back pain at a referral center in Vancouver, Canada. RESULTS: A total of 215 patients completed the questionnaire and 84 completed a retest. On average, patients answered 5.2 items per CAT-5D-QOL scale. Reliability ranged from 0.83 (FEEL) to 0.92 (PAIN) and was 0.92 for the MODI, RMDQ, and Physical Component Summary (PCS-36). The ceiling effect was 0.5\% for PAIN compared with 2\% for MODI and 5\% for RMQ. The CAT-5D-QOL scales correlated as anticipated with other measures of HRQL and discriminated well according to the level of satisfaction with current symptoms, duration of the last episode, sciatica, and disability compensation. The average relative discrimination index was 0.87 for PAIN, 0.67 for DAILY and 0.62 for WALK, compared with 0.89 for MODI, 0.80 for RMDQ, and 0.59 for PCS-36. CONCLUSION: The CAT-5D-QOL is feasible, reliable, valid, and efficient in patients with back pain. This methodology can be recommended for use in back pain research and should improve outcome assessment, facilitate comparisons across studies, and reduce patient burden.}, keywords = {*Disability Evaluation, *Health Status Indicators, *Quality of Life, Adult, Aged, Algorithms, Back Pain/*diagnosis/psychology, British Columbia, Diagnosis, Computer-Assisted/*standards, Feasibility Studies, Female, Humans, Internet, Male, Middle Aged, Predictive Value of Tests, Questionnaires/*standards, Reproducibility of Results}, isbn = {1528-1159 (Electronic)0362-2436 (Linking)}, author = {Kopec, J. A. and Badii, M. and McKenna, M. and Lima, V. D. and Sayre, E. C. 
and Dvorak, M.} } @article {2103, title = {Computerized Adaptive Testing of Personality Traits}, journal = {Zeitschrift f{\"u}r Psychologie / Journal of Psychology}, volume = {216}, year = {2008}, pages = {12-21}, abstract = {

A computerized adaptive testing (CAT) procedure was simulated with ordinal polytomous personality data collected using a
conventional paper-and-pencil testing format. An adapted Dutch version of the dominance scale of Gough and Heilbrun\’s Adjective
Check List (ACL) was used. This version contained Likert response scales with five categories. Item parameters were estimated using Samejima\’s graded response model from the responses of 1,925 subjects. The CAT procedure was simulated using the responses of 1,517 other subjects. The value of the required standard error in the stopping rule of the CAT was manipulated. The relationship between CAT latent trait estimates and estimates based on all dominance items was studied. Additionally, the pattern of relationships between the CAT latent trait estimates and the other ACL scales was compared to that between latent trait estimates based on the entire item pool and the other ACL scales. The CAT procedure resulted in latent trait estimates qualitatively equivalent to latent trait estimates based on all items, while a substantial reduction in the number of items used could be realized (at the stopping rule of 0.4, about 33\% of the 36 items were used).

}, keywords = {Adaptive Testing, cmoputer-assisted testing, Item Response Theory, Likert scales, Personality Measures}, doi = {10.1027/0044-3409.216.1.12}, author = {Hol, A. M. and Vorst, H. C. M. and Mellenbergh, G. J.} } @article {307, title = {The D-optimality item selection criterion in the early stage of CAT: A study with the graded response model}, journal = {Journal of Educational and Behavioral Statistics}, volume = {33}, number = {1}, year = {2008}, pages = {88-110}, abstract = {During the early stage of computerized adaptive testing (CAT), item selection criteria based on Fisher{\textquoteright}s information often produce less stable latent trait estimates than the Kullback-Leibler global information criterion. Robustness against early stage instability has been reported for the D-optimality criterion in a polytomous CAT with the Nominal Response Model and is shown herein to be reproducible for the Graded Response Model. For comparative purposes, the A-optimality and the global information criteria are also applied. Their item selection is investigated as a function of test progression and item bank composition. The results indicate how the selection of specific item parameters underlies the criteria performances evaluated via accuracy and precision of estimation. In addition, the criteria item exposure rates are compared, without the use of any exposure controlling measure. On the account of stability, precision, accuracy, numerical simplicity, and less evidently, item exposure rate, the D-optimality criterion can be recommended for CAT.}, keywords = {computerized adaptive testing, D optimality, item selection}, author = {Passos, V. L. and Berger, M. P. F. and Tan, F. E. S.} } @article {287, title = {Measuring physical functioning in children with spinal impairments with computerized adaptive testing}, journal = {Journal of Pediatric Orthopedics}, volume = {28}, number = {3}, year = {2008}, note = {Mulcahey, M JHaley, Stephen MDuffy, TheresaPengsheng, NiBetz, Randal RK02 HD045354-01A1/HD/NICHD NIH HHS/United StatesUnited StatesJournal of pediatric orthopedicsJ Pediatr Orthop. 2008 Apr-May;28(3):330-5.}, month = {Apr-May}, pages = {330-5}, edition = {2008/03/26}, abstract = {BACKGROUND: The purpose of this study was to assess the utility of measuring current physical functioning status of children with scoliosis and kyphosis by applying computerized adaptive testing (CAT) methods. Computerized adaptive testing uses a computer interface to administer the most optimal items based on previous responses, reducing the number of items needed to obtain a scoring estimate. METHODS: This was a prospective study of 77 subjects (0.6-19.8 years) who were seen by a spine surgeon during a routine clinic visit for progress spine deformity. Using a multidimensional version of the Pediatric Evaluation of Disability Inventory CAT program (PEDI-MCAT), we evaluated content range, accuracy and efficiency, known-group validity, concurrent validity with the Pediatric Outcomes Data Collection Instrument, and test-retest reliability in a subsample (n = 16) within a 2-week interval. RESULTS: We found the PEDI-MCAT to have sufficient item coverage in both self-care and mobility content for this sample, although most patients tended to score at the higher ends of both scales. 
Both the accuracy of PEDI-MCAT scores as compared with a fixed format of the PEDI (r = 0.98 for both mobility and self-care) and test-retest reliability were very high [self-care: intraclass correlation (3,1) = 0.98, mobility: intraclass correlation (3,1) = 0.99]. The PEDI-MCAT took an average of 2.9 minutes for the parents to complete. The PEDI-MCAT detected expected differences between patient groups, and scores on the PEDI-MCAT correlated in expected directions with scores from the Pediatric Outcomes Data Collection Instrument domains. CONCLUSIONS: Use of the PEDI-MCAT to assess the physical functioning status, as perceived by parents of children with complex spinal impairments, seems to be feasible and achieves accurate and efficient estimates of self-care and mobility function. Additional item development will be needed at the higher functioning end of the scale to avoid ceiling effects for older children. LEVEL OF EVIDENCE: This is a level II prospective study designed to establish the utility of computer adaptive testing as an evaluation method in a busy pediatric spine practice.}, keywords = {*Disability Evaluation, Adolescent, Child, Child, Preschool, Computer Simulation, Cross-Sectional Studies, Disabled Children/*rehabilitation, Female, Humans, Infant, Kyphosis/*diagnosis/rehabilitation, Male, Prospective Studies, Reproducibility of Results, Scoliosis/*diagnosis/rehabilitation}, isbn = {0271-6798 (Print)0271-6798 (Linking)}, author = {Mulcahey, M. J. and Haley, S. M. and Duffy, T. and Pengsheng, N. and Betz, R. R.} } @article {199, title = {Computerized adaptive testing for polytomous motivation items: Administration mode effects and a comparison with short forms}, journal = {Applied Psychological Measurement}, volume = {31}, number = {5}, year = {2007}, note = {10.1177/0146621606297314Journal; Peer Reviewed Journal; Journal Article}, pages = {412-429}, abstract = {In a randomized experiment (n=515), a computerized and a computerized adaptive test (CAT) are compared. The item pool consists of 24 polytomous motivation items. Although items are carefully selected, calibration data show that Samejima{\textquoteright}s graded response model did not fit the data optimally. A simulation study is done to assess possible consequences of model misfit. CAT efficiency was studied by a systematic comparison of the CAT with two types of conventional fixed length short forms, which are created to be good CAT competitors. Results showed no essential administration mode effects. Efficiency analyses show that CAT outperformed the short forms in almost all aspects when results are aggregated along the latent trait scale. The real and the simulated data results are very similar, which indicate that the real data results are not affected by model misfit. (PsycINFO Database Record (c) 2007 APA ) (journal abstract)}, keywords = {2220 Tests \& Testing, Adaptive Testing, Attitude Measurement, computer adaptive testing, Computer Assisted Testing, items, Motivation, polytomous motivation, Statistical Validity, Test Administration, Test Forms, Test Items}, isbn = {0146-6216}, author = {Hol, A. M. and Vorst, H. C. M. and Mellenbergh, G. 
J.} } @article {52, title = {Improving patient reported outcomes using item response theory and computerized adaptive testing}, journal = {Journal of Rheumatology}, volume = {34}, number = {6}, year = {2007}, note = {Chakravarty, Eliza FBjorner, Jakob BFries, James FAr052158/ar/niamsConsensus Development ConferenceResearch Support, N.I.H., ExtramuralCanadaThe Journal of rheumatologyJ Rheumatol. 2007 Jun;34(6):1426-31.}, month = {Jun}, pages = {1426-31}, edition = {2007/06/07}, abstract = {OBJECTIVE: Patient reported outcomes (PRO) are considered central outcome measures for both clinical trials and observational studies in rheumatology. More sophisticated statistical models, including item response theory (IRT) and computerized adaptive testing (CAT), will enable critical evaluation and reconstruction of currently utilized PRO instruments to improve measurement precision while reducing item burden on the individual patient. METHODS: We developed a domain hierarchy encompassing the latent trait of physical function/disability from the more general to most specific. Items collected from 165 English-language instruments were evaluated by a structured process including trained raters, modified Delphi expert consensus, and then patient evaluation. Each item in the refined data bank will undergo extensive analysis using IRT to evaluate response functions and measurement precision. CAT will allow for real-time questionnaires of potentially smaller numbers of questions tailored directly to each individual{\textquoteright}s level of physical function. RESULTS: Physical function/disability domain comprises 4 subdomains: upper extremity, trunk, lower extremity, and complex activities. Expert and patient review led to consensus favoring use of present-tense "capability" questions using a 4- or 5-item Likert response construct over past-tense "performance"items. Floor and ceiling effects, attribution of disability, and standardization of response categories were also addressed. CONCLUSION: By applying statistical techniques of IRT through use of CAT, existing PRO instruments may be improved to reduce questionnaire burden on the individual patients while increasing measurement precision that may ultimately lead to reduced sample size requirements for costly clinical trials.}, keywords = {*Rheumatic Diseases/physiopathology/psychology, Clinical Trials, Data Interpretation, Statistical, Disability Evaluation, Health Surveys, Humans, International Cooperation, Outcome Assessment (Health Care)/*methods, Patient Participation/*methods, Research Design/*trends, Software}, isbn = {0315-162X (Print)}, author = {Chakravarty, E. F. and Bjorner, J. B. and Fries, J.F.} } @article {363, title = {The initial development of an item bank to assess and screen for psychological distress in cancer patients}, journal = {Psycho-Oncology}, volume = {16}, number = {8}, year = {2007}, note = {10.1002/pon.1117Journal; Peer Reviewed Journal; Journal Article}, pages = {724-732}, abstract = {Psychological distress is a common problem among cancer patients. Despite the large number of instruments that have been developed to assess distress, their utility remains disappointing. This study aimed to use Rasch models to develop an item-bank which would provide the basis for better means of assessing psychological distress in cancer patients. An item bank was developed from eight psychological distress questionnaires using Rasch analysis to link common items. 
Items from the questionnaires were added iteratively with common items as anchor points and misfitting items (infit mean square > 1.3) removed, and unidimensionality assessed. A total of 4914 patients completed the questionnaires providing an initial pool of 83 items. Twenty items were removed resulting in a final pool of 63 items. Good fit was demonstrated and no additional factor structure was evident from the residuals. However, there was little overlap between item locations and person measures, since items mainly targeted higher levels of distress. The Rasch analysis allowed items to be pooled and generated a unidimensional instrument for measuring psychological distress in cancer patients. Additional items are required to more accurately assess patients across the whole continuum of psychological distress. (PsycINFO Database Record (c) 2007 APA ) (journal abstract)}, keywords = {3293 Cancer, cancer patients, Distress, initial development, Item Response Theory, Models, Neoplasms, Patients, Psychological, psychological distress, Rasch, Stress}, isbn = {1057-9249}, author = {Smith, A. B. and Rush, R. and Velikova, G. and Wall, L. and Wright, E. P. and Stark, D. and Selby, P. and Sharpe, M.} } @article {18, title = {Methods for restricting maximum exposure rate in computerized adaptative testing}, journal = {Methodology: European Journal of Research Methods for the Behavioral and Social Sciences}, volume = {3}, number = {1}, year = {2007}, pages = {14-23}, publisher = {Hogrefe \& Huber Publishers GmbH: Germany}, abstract = {The Sympson-Hetter (1985) method provides a means of controlling maximum exposure rate of items in Computerized Adaptive Testing. Through a series of simulations, control parameters are set that mark the probability of administration of an item on being selected. This method presents two main problems: it requires a long computation time for calculating the parameters and the maximum exposure rate is slightly above the fixed limit. Van der Linden (2003) presented two alternatives which appear to solve both of the problems. The impact of these methods in the measurement accuracy has not been tested yet. We show how these methods over-restrict the exposure of some highly discriminating items and, thus, the accuracy is decreased. It also shown that, when the desired maximum exposure rate is near the minimum possible value, these methods offer an empirical maximum exposure rate clearly above the goal. A new method, based on the initial estimation of the probability of administration and the probability of selection of the items with the restricted method (Revuelta \& Ponsoda, 1998), is presented in this paper. It can be used with the Sympson-Hetter method and with the two van der Linden{\textquoteright}s methods. This option, when used with Sympson-Hetter, speeds the convergence of the control parameters without decreasing the accuracy. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, item bank security, item exposure control, overlap rate, Sympson-Hetter method}, isbn = {1614-1881 (Print); 1614-2241 (Electronic)}, author = {Barrada, J and Olea, J. 
and Ponsoda, V.} } @article {343, title = {Psychometric properties of an emotional adjustment measure: An application of the graded response model}, journal = {European Journal of Psychological Assessment}, volume = {23}, number = {1}, year = {2007}, pages = {39-46}, publisher = {Hogrefe \& Huber Publishers GmbH: Germany}, abstract = {Item response theory (IRT) provides valuable methods for the analysis of the psychometric properties of a psychological measure. However, IRT has been mainly used for assessing achievements and ability rather than personality factors. This paper presents an application of the IRT to a personality measure. Thus, the psychometric properties of a new emotional adjustment measure that consists of a 28-six graded response items is shown. Classical test theory (CTT) analyses as well as IRT analyses are carried out. Samejima{\textquoteright}s (1969) graded-response model has been used for estimating item parameters. Results show that the bank of items fulfills model assumptions and fits the data reasonably well, demonstrating the suitability of the IRT models for the description and use of data originating from personality measures. In this sense, the model fulfills the expectations that IRT has undoubted advantages: (1) The invariance of the estimated parameters, (2) the treatment given to the standard error of measurement, and (3) the possibilities offered for the construction of computerized adaptive tests (CAT). The bank of items shows good reliability. It also shows convergent validity compared to the Eysenck Personality Inventory (EPQ-A; Eysenck \& Eysenck, 1975) and the Big Five Questionnaire (BFQ; Caprara, Barbaranelli, \& Borgogni, 1993). (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive tests, Emotional Adjustment, Item Response Theory, Personality Measures, personnel recruitment, Psychometrics, Samejima{\textquoteright}s graded response model, test reliability, validity}, isbn = {1015-5759 (Print)}, author = {Rubio, V. J. and Aguado, D. and Hontangas, P. M. and Hern{\'a}ndez, J. M.} } @article {401, title = {Assembling a computerized adaptive testing item pool as a set of linear tests}, journal = {Journal of Educational and Behavioral Statistics}, volume = {31}, number = {1}, year = {2006}, pages = {81-99}, publisher = {Sage Publications: US}, abstract = {Test-item writing efforts typically results in item pools with an undesirable correlational structure between the content attributes of the items and their statistical information. If such pools are used in computerized adaptive testing (CAT), the algorithm may be forced to select items with less than optimal information, that violate the content constraints, and/or have unfavorable exposure rates. Although at first sight somewhat counterintuitive, it is shown that if the CAT pool is assembled as a set of linear test forms, undesirable correlations can be broken down effectively. It is proposed to assemble such pools using a mixed integer programming model with constraints that guarantee that each test meets all content specifications and an objective function that requires them to have maximal information at a well-chosen set of ability values. An empirical example with a previous master pool from the Law School Admission Test (LSAT) yielded a CAT with nearly uniform bias and mean-squared error functions for the ability estimator and item-exposure rates that satisfied the target for all items in the pool. 
}, keywords = {Algorithms, computerized adaptive testing, item pool, linear tests, mathematical models, statistics, Test Construction, Test Items}, isbn = {1076-9986 (Print)}, author = {van der Linden, W. J. and Ariel, A. and Veldkamp, B. P.} } @article {247, title = {Comparing methods of assessing differential item functioning in a computerized adaptive testing environment}, journal = {Journal of Educational Measurement}, volume = {43}, number = {3}, year = {2006}, pages = {245-264}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Mantel-Haenszel and SIBTEST, which have known difficulty in detecting non-unidirectional differential item functioning (DIF), have been adapted with some success for computerized adaptive testing (CAT). This study adapts logistic regression (LR) and the item-response-theory-likelihood-ratio test (IRT-LRT), capable of detecting both unidirectional and non-unidirectional DIF, to the CAT environment in which pretest items are assumed to be seeded in CATs but not used for trait estimation. The proposed adaptation methods were evaluated with simulated data under different sample size ratios and impact conditions in terms of Type I error, power, and specificity in identifying the form of DIF. The adapted LR and IRT-LRT procedures are more powerful than the CAT version of SIBTEST for non-unidirectional DIF detection. The good Type I error control provided by IRT-LRT under extremely unequal sample sizes and large impact is encouraging. Implications of these and other findings are discussed. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, educational testing, item response theory likelihood ratio test, logistic regression, trait estimation, unidirectional \& non-unidirectional differential item functioning}, isbn = {0022-0655 (Print)}, author = {Lei, P-W. and Chen, S-Y. and Yu, L.} } @article {164, title = {The comparison among item selection strategies of CAT with multiple-choice items}, journal = {Acta Psychologica Sinica}, volume = {38}, number = {5}, year = {2006}, pages = {778-783}, publisher = {Science Press: China}, abstract = {The initial purpose of comparing item selection strategies for CAT was to increase the efficiency of tests. As studies continued, however, it was found that increasing the efficiency of item bank usage was also an important goal of comparing item selection strategies. These two goals often conflicted. The key solution was to find a strategy with which both goals could be accomplished. The item selection strategies for the graded response model in this study included: the average of the difficulty orders matching with the ability; the medium of the difficulty orders matching with the ability; maximum information; A stratified (average); and A stratified (medium). The evaluation indexes used for comparison included: the bias of ability estimates relative to the true values; the standard error of ability estimates; the average number of items administered to the examinees; the standard deviation of the frequency of items selected; and a weighted sum of these indices. Using the Monte Carlo simulation method, we generated data and iterated each condition 20 times, with the item difficulty parameters following either a normal or a uniform distribution. The results indicated that, regardless of the difficulty parameter distribution, every type of item selection strategy designed in this research had its strong and weak points.
In general evaluation, under the condition that items were stratified appropriately, A stratified (medium) (ASM) had the best effect. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {CAT, computerized adaptive testing, graded response model, item selection strategies, multiple choice items}, isbn = {0439-755X (Print)}, author = {Hai-qi, D. and De-zhi, C. and Shuliang, D. and Taiping, D.} } @article {314, title = {[Item Selection Strategies of Computerized Adaptive Testing based on Graded Response Model.]}, journal = {Acta Psychologica Sinica}, volume = {38}, number = {3}, year = {2006}, pages = {461-467}, publisher = {Science Press: China}, abstract = {Item selection strategy (ISS) is an important component of Computerized Adaptive Testing (CAT). Its performance directly affects the security, efficiency and precision of the test. Thus, ISS becomes one of the central issues in CATs based on the Graded Response Model (GRM). It is well known that the goal of IIS is to administer the next unused item remaining in the item bank that best fits the examinees current ability estimate. In dichotomous IRT models, every item has only one difficulty parameter and the item whose difficulty matches the examinee{\textquoteright}s current ability estimate is considered to be the best fitting item. However, in GRM, each item has more than two ordered categories and has no single value to represent the item difficulty. Consequently, some researchers have used to employ the average or the median difficulty value across categories as the difficulty estimate for the item. Using the average value and the median value in effect introduced two corresponding ISSs. In this study, we used computer simulation compare four ISSs based on GRM. We also discussed the effect of "shadow pool" on the uniformity of pool usage as well as the influence of different item parameter distributions and different ability estimation methods on the evaluation criteria of CAT. In the simulation process, Monte Carlo method was adopted to simulate the entire CAT process; 1,000 examinees drawn from standard normal distribution and four 1,000-sized item pools of different item parameter distributions were also simulated. The assumption of the simulation is that a polytomous item is comprised of six ordered categories. In addition, ability estimates were derived using two methods. They were expected a posteriori Bayesian (EAP) and maximum likelihood estimation (MLE). In MLE, the Newton-Raphson iteration method and the Fisher Score iteration method were employed, respectively, to solve the likelihood equation. Moreover, the CAT process was simulated with each examinee 30 times to eliminate random error. The IISs were evaluated by four indices usually used in CAT from four aspects--the accuracy of ability estimation, the stability of IIS, the usage of item pool, and the test efficiency. Simulation results showed adequate evaluation of the ISS that matched the estimate of an examinee{\textquoteright}s current trait level with the difficulty values across categories. Setting "shadow pool" in ISS was able to improve the uniformity of pool utilization. Finally, different distributions of the item parameter and different ability estimation methods affected the evaluation indices of CAT. 
(PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, item selection strategy}, isbn = {0439-755X (Print)}, author = {Ping, Chen and Shuliang, Ding and Haijing, Lin and Jie, Zhou} } @article {174, title = {Measurement precision and efficiency of multidimensional computer adaptive testing of physical functioning using the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {87}, number = {9}, year = {2006}, note = {Haley, Stephen MNi, PengshengLudlow, Larry HFragala-Pinkham, Maria AK02 hd45354-01/hd/nichdResearch Support, N.I.H., ExtramuralResearch Support, Non-U.S. Gov{\textquoteright}tUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2006 Sep;87(9):1223-9.}, month = {Sep}, pages = {1223-9}, edition = {2006/08/29}, abstract = {OBJECTIVE: To compare the measurement efficiency and precision of a multidimensional computer adaptive testing (M-CAT) application to a unidimensional CAT (U-CAT) comparison using item bank data from 2 of the functional skills scales of the Pediatric Evaluation of Disability Inventory (PEDI). DESIGN: Using existing PEDI mobility and self-care item banks, we compared the stability of item calibrations and model fit between unidimensional and multidimensional Rasch models and compared the efficiency and precision of the U-CAT- and M-CAT-simulated assessments to a random draw of items. SETTING: Pediatric rehabilitation hospital and clinics. PARTICIPANTS: Clinical and normative samples. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Not applicable. RESULTS: The M-CAT had greater levels of precision and efficiency than the separate mobility and self-care U-CAT versions when using a similar number of items for each PEDI subdomain. Equivalent estimation of mobility and self-care scores can be achieved with a 25\% to 40\% item reduction with the M-CAT compared with the U-CAT. CONCLUSIONS: M-CAT applications appear to have both precision and efficiency advantages compared with separate U-CAT assessments when content subdomains have a high correlation. Practitioners may also realize interpretive advantages of reporting test score information for each subdomain when separate clinical inferences are desired.}, keywords = {*Disability Evaluation, *Pediatrics, Adolescent, Child, Child, Preschool, Computers, Disabled Persons/*classification/rehabilitation, Efficiency, Humans, Infant, Outcome Assessment (Health Care), Psychometrics, Self Care}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Ni, P. and Ludlow, L. H. and Fragala-Pinkham, M. A.} } @article {116, title = {Optimal testing with easy or difficult items in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {30}, number = {5}, year = {2006}, pages = {379-393}, publisher = {Sage Publications: US}, abstract = {Computerized adaptive tests (CATs) are individualized tests that, from a measurement point of view, are optimal for each individual, possibly under some practical conditions. In the present study, it is shown that maximum information item selection in CATs using an item bank that is calibrated with the one- or the two-parameter logistic model results in each individual answering about 50\% of the items correctly. Two item selection procedures giving easier (or more difficult) tests for students are presented and evaluated. 
Item selection on probability points of items yields good results only with the one-parameter logistic model and not with the two-parameter logistic model. An alternative selection procedure, based on maximum information at a shifted ability level, gives satisfactory results with both models. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive tests, individualized tests, Item Response Theory, item selection, Measurement}, isbn = {0146-6216 (Print)}, author = {Theo Eggen and Verschoor, Angela J.} } @article {2073, title = {Simulated computerized adaptive test for patients with lumbar spine impairments was efficient and produced valid measures of function}, journal = {Journal of Clinical Epidemiology}, volume = {59}, year = {2006}, pages = {947{\textendash}956}, abstract = {Objective: To equate physical functioning (PF) items with Back Pain Functional Scale (BPFS) items, develop a computerized adaptive test (CAT) designed to assess lumbar spine functional status (LFS) in people with lumbar spine impairments, and compare discriminant validity of LFS measures (θIRT) generated using all items analyzed with a rating scale Item Response Theory model (RSM) and measures generated using the simulated CAT (θCAT). Methods: We performed a secondary analysis of retrospective intake rehabilitation data. Results: Unidimensionality and local independence of 25 BPFS and PF items were supported. Differential item functioning was negligible for levels of symptom acuity, gender, age, and surgical history. The RSM fit the data well. A lumbar spine specific CAT was developed that was 72\% more efficient than using all 25 items to estimate LFS measures. θIRT and θCAT measures did not discriminate patients by symptom acuity, age, or gender, but discriminated patients by surgical history in similar clinically logical ways. θCAT measures were as precise as θIRT measures. Conclusion: A body part specific simulated CAT developed from an LFS item bank was efficient and produced precise measures of LFS without eroding discriminant validity.}, keywords = {Back Pain Functional Scale, computerized adaptive testing, Item Response Theory, Lumbar spine, Rehabilitation, True-score equating}, doi = {10.1016/j.jclinepi.2005.10.017}, author = {Hart, D. L. and Mioduski, J. E. and Werneke, M. W. and Stratford, P. W.} } @article {2074, title = {Simulated computerized adaptive test for patients with shoulder impairments was efficient and produced valid measures of function}, journal = {Journal of Clinical Epidemiology}, volume = {59}, year = {2006}, pages = {290-298}, abstract = {

Background and Objective: To test unidimensionality and local independence of a set of shoulder functional status (SFS) items,
develop a computerized adaptive test (CAT) of the items using a rating scale item response theory model (RSM), and compare discriminant validity of measures generated using all items (θIRT) and measures generated using the simulated CAT (θCAT).
Study Design and Setting: We performed a secondary analysis of data collected prospectively during rehabilitation of 400 patients
with shoulder impairments who completed 60 SFS items.
Results: Factor analytic techniques supported that the 42 SFS items formed a unidimensional scale and were locally independent. Except for five items, which were deleted, the RSM fit the data well. The remaining 37 SFS items were used to generate the CAT. On average, 6 items were needed to estimate precise measures of function using the SFS CAT, compared with all 37 SFS items. The θIRT and θCAT measures were highly correlated (r = .96) and resulted in similar classifications of patients.
Conclusion: The simulated SFS CAT was efficient and produced precise, clinically relevant measures of functional status with good
discriminating ability.

}, keywords = {computerized adaptive testing, Flexilevel Scale of Shoulder Function, Item Response Theory, Rehabilitation}, author = {Hart, D. L. and Cook, K. F. and Mioduski, J. E. and Teal, C. R. and Crane, P. K.} } @article {296, title = {T{\'e}cnicas para detectar patrones de respuesta at{\'\i}picos [Aberrant patterns detection methods]}, journal = {Anales de Psicolog{\'\i}a}, volume = {22}, number = {1}, year = {2006}, note = {Spain: Universidad de Murcia}, pages = {143-154}, abstract = {La identificaci{\'o}n de patrones de respuesta at{\'\i}picos es de gran utilidad para la construcci{\'o}n de tests y de bancos de {\'\i}tems con propiedades psicom{\'e}tricas as{\'\i} como para el an{\'a}lisis de validez de los mismos. En este trabajo de revisi{\'o}n se han recogido los m{\'a}s relevantes y novedosos m{\'e}todos de ajuste de personas que se han elaborado dentro de cada uno de los principales {\'a}mbitos de trabajo de la Psicometr{\'\i}a: el escalograma de Guttman, la Teor{\'\i}a Cl{\'a}sica de Tests (TCT), la Teor{\'\i}a de la Generalizabilidad (TG), la Teor{\'\i}a de Respuesta al {\'I}tem (TRI), los Modelos de Respuesta al {\'I}tem No Param{\'e}tricos (MRINP), los Modelos de Clase Latente de Orden Restringido (MCL-OR) y el An{\'a}lisis de Estructura de Covarianzas (AEC).Aberrant patterns detection has a great usefulness in order to make tests and item banks with psychometric characteristics and validity analysis of tests and items. The most relevant and newest person-fit methods have been reviewed. All of them have been made in each one of main areas of Psychometry: Guttman{\textquoteright}s scalogram, Classical Test Theory (CTT), Generalizability Theory (GT), Item Response Theory (IRT), Non-parametric Response Models (NPRM), Order-Restricted Latent Class Models (OR-LCM) and Covariance Structure Analysis (CSA).}, keywords = {aberrant patterns detection, Classical Test Theory, generalizability theory, Item Response, Item Response Theory, Mathematics, methods, person-fit, Psychometrics, psychometry, Test Validity, test validity analysis, Theory}, isbn = {0212-9728}, author = {N{\'u}{\~n}ez, R. M. N. and Pina, J. A. L.} } @inbook {180, title = {Applications of item response theory to improve health outcomes assessment: Developing item banks, linking instruments, and computer-adaptive testing}, booktitle = {Outcomes assessment in cancer}, year = {2005}, note = {Using Smart Source ParsingOutcomes assessment in cancer: Measures, methods, and applications. (pp. 445-464). New York, NY : Cambridge University Press. xiv, 662 pp}, pages = {445-464}, publisher = {Cambridge University Press}, organization = {Cambridge University Press}, address = {Cambridge, UK}, abstract = {(From the chapter) The current chapter builds on Reise{\textquoteright}s introduction to the basic concepts, assumptions, popular models, and important features of IRT and discusses the applications of item response theory (IRT) modeling to health outcomes assessment. In particular, we highlight the critical role of IRT modeling in: developing an instrument to match a study{\textquoteright}s population; linking two or more instruments measuring similar constructs on a common metric; and creating item banks that provide the foundation for tailored short-form instruments or for computerized adaptive assessments. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, Health, Item Response Theory, Measurement, Test Construction, Treatment Outcomes}, author = {Hambleton, R. K.}, editor = {C. C. 
Gotay and C. Snyder} } @article {175, title = {Assessing mobility in children using a computer adaptive testing version of the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {86}, number = {5}, year = {2005}, note = {Haley, Stephen MRaczek, Anastasia ECoster, Wendy JDumas, Helene MFragala-Pinkham, Maria AK02 hd45354-01a1/hd/nichdR43 hd42388-01/hd/nichdResearch Support, N.I.H., ExtramuralResearch Support, U.S. Gov{\textquoteright}t, P.H.S.United StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2005 May;86(5):932-9.}, month = {May}, pages = {932-9}, edition = {2005/05/17}, abstract = {OBJECTIVE: To assess score agreement, validity, precision, and response burden of a prototype computerized adaptive testing (CAT) version of the Mobility Functional Skills Scale (Mob-CAT) of the Pediatric Evaluation of Disability Inventory (PEDI) as compared with the full 59-item version (Mob-59). DESIGN: Computer simulation analysis of cross-sectional and longitudinal retrospective data; and cross-sectional prospective study. SETTING: Pediatric rehabilitation hospital, including inpatient acute rehabilitation, day school program, outpatient clinics, community-based day care, preschool, and children{\textquoteright}s homes. PARTICIPANTS: Four hundred sixty-nine children with disabilities and 412 children with no disabilities (analytic sample); 41 children without disabilities and 39 with disabilities (cross-validation sample). INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from a prototype Mob-CAT application and versions using 15-, 10-, and 5-item stopping rules; scores from the Mob-59; and number of items and time (in seconds) to administer assessments. RESULTS: Mob-CAT scores from both computer simulations (intraclass correlation coefficient [ICC] range, .94-.99) and field administrations (ICC=.98) were in high agreement with scores from the Mob-59. Using computer simulations of retrospective data, discriminant validity, and sensitivity to change of the Mob-CAT closely approximated that of the Mob-59, especially when using the 15- and 10-item stopping rule versions of the Mob-CAT. The Mob-CAT used no more than 15\% of the items for any single administration, and required 20\% of the time needed to administer the Mob-59. CONCLUSIONS: Comparable score estimates for the PEDI mobility scale can be obtained from CAT administrations, with losses in validity and precision for shorter forms, but with a considerable reduction in administration time.}, keywords = {*Computer Simulation, *Disability Evaluation, Adolescent, Child, Child, Preschool, Cross-Sectional Studies, Disabled Children/*rehabilitation, Female, Humans, Infant, Male, Outcome Assessment (Health Care)/*methods, Rehabilitation Centers, Rehabilitation/*standards, Sensitivity and Specificity}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Raczek, A. E. and Coster, W. J. and Dumas, H. M. and Fragala-Pinkham, M. A.} } @article {2095, title = {An Authoring Environment for Adaptive Testing}, journal = {Educational Technology \& Society}, volume = {8}, year = {2005}, pages = {66-76}, abstract = {

SIETTE is a web-based adaptive testing system. It implements Computerized Adaptive Tests. These tests are tailor-made, theory-based tests, where the questions shown to students, the finalization of the test, and the estimation of student knowledge are all accomplished adaptively. To construct these tests, SIETTE has an authoring environment comprising a suite of tools that helps teachers create questions and tests properly, and analyze students\’ performance after taking a test. In this paper, we present this authoring environment in the
framework of adaptive testing. As will be shown, this set of visual tools, which contains some adaptable features, can be useful for teachers lacking skills in this kind of testing. Additionally, other systems that implement adaptive testing will be studied.

}, keywords = {Adaptability, Adaptive Testing, Authoring environment, Item Response Theory}, author = {Guzm{\'a}n, E and Conejo, R and Garc{\'\i}a-Herv{\'a}s, E} } @article {102, title = {A Bayesian student model without hidden nodes and its comparison with item response theory}, journal = {International Journal of Artificial Intelligence in Education}, volume = {15}, number = {4}, year = {2005}, pages = {291-323}, publisher = {IOS Press: Netherlands}, abstract = {The Bayesian framework offers a number of techniques for inferring an individual{\textquoteright}s knowledge state from evidence of mastery of concepts or skills. A typical application where such a technique can be useful is Computer Adaptive Testing (CAT). A Bayesian modeling scheme, POKS, is proposed and compared to the traditional Item Response Theory (IRT), which has been the prevalent CAT approach for the last three decades. POKS is based on the theory of knowledge spaces and constructs item-to-item graph structures without hidden nodes. It aims to offer an effective knowledge assessment method with an efficient algorithm for learning the graph structure from data. We review the different Bayesian approaches to modeling student ability assessment and discuss how POKS relates to them. The performance of POKS is compared to the IRT two parameter logistic model. Experimental results over a 34 item Unix test and a 160 item French language test show that both approaches can classify examinees as master or non-master effectively and efficiently, with relatively comparable performance. However, more significant differences are found in favor of POKS for a second task that consists in predicting individual question item outcome. Implications of these results for adaptive testing and student modeling are discussed, as well as the limitations and advantages of POKS, namely the issue of integrating concepts into its structure. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {Bayesian Student Model, computer adaptive testing, hidden nodes, Item Response Theory}, isbn = {1560-4292 (Print); 1560-4306 (Electronic)}, author = {Desmarais, M. C. and Pu, X.} } @article {398, title = {A comparison of item-selection methods for adaptive tests with content constraints}, journal = {Journal of Educational Measurement}, volume = {42}, number = {3}, year = {2005}, pages = {283-302}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {In test assembly, a fundamental difference exists between algorithms that select a test sequentially or simultaneously. Sequential assembly allows us to optimize an objective function at the examinee{\textquoteright}s ability estimate, such as the test information function in computerized adaptive testing. But it leads to the non-trivial problem of how to realize a set of content constraints on the test{\textemdash}a problem more naturally solved by a simultaneous item-selection method. Three main item-selection methods in adaptive testing offer solutions to this dilemma. The spiraling method moves item selection across categories of items in the pool proportionally to the numbers needed from them. Item selection by the weighted-deviations method (WDM) and the shadow test approach (STA) is based on projections of the future consequences of selecting an item. These two methods differ in that the former calculates a projection of a weighted sum of the attributes of the eventual test and the latter a projection of the test itself. The pros and cons of these methods are analyzed. 
An empirical comparison between the WDM and STA was conducted for an adaptive version of the Law School Admission Test (LSAT), which showed equally good item-exposure rates but violations of some of the constraints and larger bias and inaccuracy of the ability estimator for the WDM.}, keywords = {Adaptive Testing, Algorithms, content constraints, item selection method, shadow test approach, spiraling method, weighted deviations method}, isbn = {0022-0655 (Print)}, author = {van der Linden, W. J.} } @article {171, title = {A computer adaptive testing approach for assessing physical functioning in children and adolescents}, journal = {Developmental Medicine and Child Neuropsychology}, volume = {47}, number = {2}, year = {2005}, note = {Haley, Stephen MNi, PengshengFragala-Pinkham, Maria ASkrinar, Alison MCorzo, DeyaniraComparative StudyResearch Support, Non-U.S. Gov{\textquoteright}tEnglandDevelopmental medicine and child neurologyDev Med Child Neurol. 2005 Feb;47(2):113-20.}, month = {Feb}, pages = {113-120}, edition = {2005/02/15}, abstract = {The purpose of this article is to demonstrate: (1) the accuracy and (2) the reduction in amount of time and effort in assessing physical functioning (self-care and mobility domains) of children and adolescents using computer-adaptive testing (CAT). A CAT algorithm selects questions directly tailored to the child{\textquoteright}s ability level, based on previous responses. Using a CAT algorithm, a simulation study was used to determine the number of items necessary to approximate the score of a full-length assessment. We built simulated CAT (5-, 10-, 15-, and 20-item versions) for self-care and mobility domains and tested their accuracy in a normative sample (n=373; 190 males, 183 females; mean age 6y 11mo [SD 4y 2m], range 4mo to 14y 11mo) and a sample of children and adolescents with Pompe disease (n=26; 21 males, 5 females; mean age 6y 1mo [SD 3y 10mo], range 5mo to 14y 10mo). Results indicated that comparable score estimates (based on computer simulations) to the full-length tests can be achieved in a 20-item CAT version for all age ranges and for normative and clinical samples. No more than 13 to 16\% of the items in the full-length tests were needed for any one administration. These results support further consideration of using CAT programs for accurate and efficient clinical assessments of physical functioning.}, keywords = {*Computer Systems, Activities of Daily Living, Adolescent, Age Factors, Child, Child Development/*physiology, Child, Preschool, Computer Simulation, Confidence Intervals, Demography, Female, Glycogen Storage Disease Type II/physiopathology, Health Status Indicators, Humans, Infant, Infant, Newborn, Male, Motor Activity/*physiology, Outcome Assessment (Health Care)/*methods, Reproducibility of Results, Self Care, Sensitivity and Specificity}, isbn = {0012-1622 (Print)}, author = {Haley, S. M. and Ni, P. and Fragala-Pinkham, M. A. and Skrinar, A. M. and Corzo, D.} } @article {72, title = {Controlling item exposure and test overlap in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {29}, number = {3}, year = {2005}, pages = {204-217}, abstract = {This article proposes an item exposure control method, which is the extension of the Sympson and Hetter procedure and can provide item exposure control at both the item and test levels. Item exposure rate and test overlap rate are two indices commonly used to track item exposure in computerized adaptive tests. 
By considering both indices, item exposure can be monitored at both the item and test levels. To control the item exposure rate and test overlap rate simultaneously, the modified procedure attempted to control not only the maximum value but also the variance of item exposure rates. Results indicated that the item exposure rate and test overlap rate could be controlled simultaneously by implementing the modified procedure. Item exposure control was improved and precision of trait estimation decreased when a prespecified maximum test overlap rate was stringent. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Content (Test) computerized adaptive testing}, author = {Chen, S-Y. and Lei, P-W.} } @article {121, title = {Data pooling and analysis to build a preliminary item bank: an example using bowel function in prostate cancer}, journal = {Evaluation and the Health Professions}, volume = {28}, number = {2}, year = {2005}, note = {0163-2787 (Print)Journal Article}, pages = {142-59}, abstract = {Assessing bowel function (BF) in prostate cancer can help determine therapeutic trade-offs. We determined the components of BF commonly assessed in prostate cancer studies as an initial step in creating an item bank for clinical and research application. We analyzed six archived data sets representing 4,246 men with prostate cancer. Thirty-one items from validated instruments were available for analysis. Items were classified into domains (diarrhea, rectal urgency, pain, bleeding, bother/distress, and other) then subjected to conventional psychometric and item response theory (IRT) analyses. Items fit the IRT model if the ratio between observed and expected item variance was between 0.60 and 1.40. Four of 31 items had inadequate fit in at least one analysis. Poorly fitting items included bleeding (2), rectal urgency (1), and bother/distress (1). A fifth item assessing hemorrhoids was poorly correlated with other items. Our analyses supported four related components of BF: diarrhea, rectal urgency, pain, and bother/distress.}, keywords = {*Quality of Life, *Questionnaires, Adult, Aged, Data Collection/methods, Humans, Intestine, Large/*physiopathology, Male, Middle Aged, Prostatic Neoplasms/*physiopathology, Psychometrics, Research Support, Non-U.S. Gov{\textquoteright}t, Statistics, Nonparametric}, author = {Eton, D. T. and Lai, J. S. and Cella, D. and Reeve, B. B. and Talcott, J. A. and Clark, J. A. and McPherson, C. P. and Litwin, M. S. and Moinpour, C. M.} } @article {85, title = {Dynamic assessment of health outcomes: Time to let the CAT out of the bag?}, journal = {Health Services Research}, volume = {40}, number = {5, part2}, year = {2005}, pages = {1694-1711}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Background: The use of item response theory (IRT) to measure self-reported outcomes has burgeoned in recent years. Perhaps the most important application of IRT is computer-adaptive testing (CAT), a measurement approach in which the selection of items is tailored for each respondent. Objective. To provide an introduction to the use of CAT in the measurement of health outcomes, describe several IRT models that can be used as the basis of CAT, and discuss practical issues associated with the use of adaptive scaling in research settings. 
Principal Points: The development of a CAT requires several steps that are not required in the development of a traditional measure including identification of "starting" and "stopping" rules. CAT{\textquoteright}s most attractive advantage is its efficiency. Greater measurement precision can be achieved with fewer items. Disadvantages of CAT include the high cost and level of technical expertise required to develop a CAT. Conclusions: Researchers, clinicians, and patients benefit from the availability of psychometrically rigorous measures that are not burdensome. CAT outcome measures hold substantial promise in this regard, but their development is not without challenges. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive testing, Item Response Theory, self reported health outcomes}, isbn = {0017-9124 (Print); 1475-6773 (Electronic)}, author = {Cook, K. F. and O{\textquoteright}Malley, K. J. and Roddey, T. S.} } @article {253, title = {Increasing the homogeneity of CAT{\textquoteright}s item-exposure rates by minimizing or maximizing varied target functions while assembling shadow tests}, journal = {Journal of Educational Measurement}, volume = {42}, number = {3}, year = {2005}, pages = {245-269}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {A computerized adaptive testing (CAT) algorithm that has the potential to increase the homogeneity of CATs item-exposure rates without significantly sacrificing the precision of ability estimates was proposed and assessed in the shadow-test (van der Linden \& Reese, 1998) CAT context. This CAT algorithm was formed by a combination of maximizing or minimizing varied target functions while assembling shadow tests. There were four target functions to be separately used in the first, second, third, and fourth quarter test of CAT. The elements to be used in the four functions were associated with (a) a random number assigned to each item, (b) the absolute difference between an examinee{\textquoteright}s current ability estimate and an item difficulty, (c) the absolute difference between an examinee{\textquoteright}s current ability estimate and an optimum item difficulty, and (d) item information. The results indicated that this combined CAT fully utilized all the items in the pool, reduced the maximum exposure rates, and achieved more homogeneous exposure rates. Moreover, its precision in recovering ability estimates was similar to that of the maximum item-information method. The combined CAT method resulted in the best overall results compared with the other individual CAT item-selection methods. The findings from the combined CAT are encouraging. Future uses are discussed. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {algorithm, computerized adaptive testing, item exposure rate, shadow test, varied target function}, isbn = {0022-0655 (Print)}, author = {Li, Y. H. and Schafer, W. D.} } @article {208, title = {Infeasibility in automated test assembly models: A comparison study of different methods}, journal = {Journal of Educational Measurement}, volume = {42}, number = {3}, year = {2005}, pages = {223-243}, abstract = {Several techniques exist to automatically put together a test meeting a number of specifications. In an item bank, the items are stored with their characteristics. A test is constructed by selecting a set of items that fulfills the specifications set by the test assembler. 
Test assembly problems are often formulated in terms of a model consisting of restrictions and an objective to be maximized or minimized. A problem arises when it is impossible to construct a test from the item pool that meets all specifications, that is, when the model is not feasible. Several methods exist to handle these infeasibility problems. In this article, test assembly models resulting from two practical testing programs were reconstructed to be infeasible. These models were analyzed using methods that forced a solution (Goal Programming, Multiple-Goal Programming, Greedy Heuristic), that analyzed the causes (Relaxed and Ordered Deletion Algorithm (RODA), Integer Randomized Deletion Algorithm (IRDA), Set Covering (SC), and Item Sampling), or that analyzed the causes and used this information to force a solution (Irreducible Infeasible Set-Solver). Specialized methods such as the IRDA and the Irreducible Infeasible Set-Solver performed best. Recommendations about the use of different methods are given. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Algorithms, Item Content (Test), Models, Test Construction}, author = {Huitzing, H. A. and Veldkamp, B. P. and Verschoor, A. J.} } @article {218, title = {[Item characteristic curve equating under graded response models in IRT]}, journal = {Acta Psychologica Sinica}, volume = {37}, number = {6}, year = {2005}, pages = {832-838}, publisher = {Science Press: China}, abstract = {In one of the largest qualificatory tests--economist test, to guarantee the comparability among different years, construct item bank and prepare for computerized adaptive testing, item characteristic curve equating and anchor test equating design under graded models in IRT are used, which have realized the item and ability parameter equating of test data in five years and succeeded in establishing an item bank. Based on it, cut scores of different years are compared by equating and provide demonstrational gist to constitute the eligibility standard of economist test. }, keywords = {graded response models, item characteristic curve, Item Response Theory}, isbn = {0439-755X (Print)}, author = {Jun, Z. and Dongming, O. and Shuyuan, X. and Haiqi, D. and Shuqing, Q.} } @article {4, title = {Propiedades psicom{\'e}tricas de un test Adaptativo Informatizado para la medici{\'o}n del ajuste emocional [Psychometric properties of an Emotional Adjustment Computerized Adaptive Test]}, journal = {Psicothema}, volume = {17}, number = {3}, year = {2005}, pages = {484-491}, abstract = {En el presente trabajo se describen las propiedades psicom{\'e}tricas de un Test Adaptativo Informatizado para la medici{\'o}n del ajuste emocional de las personas. La revisi{\'o}n de la literatura acerca de la aplicaci{\'o}n de los modelos de la teor{\'\i}a de la respuesta a los {\'\i}tems (TRI) muestra que {\'e}sta se ha utilizado m{\'a}s en el trabajo con variables aptitudinales que para la medici{\'o}n de variables de personalidad, sin embargo diversos estudios han mostrado la eficacia de la TRI para la descripci{\'o}n psicom{\'e}trica de dichasvariables. Aun as{\'\i}, pocos trabajos han explorado las caracter{\'\i}sticas de un Test Adaptativo Informatizado, basado en la TRI, para la medici{\'o}n de una variable de personalidad como es el ajuste emocional. 
Nuestros resultados muestran la eficiencia del TAI para la evaluaci{\'o}n del ajuste emocional, proporcionando una medici{\'o}n v{\'a}lida y precisa, utilizando menor n{\'u}mero de elementos de medida en comparaci{\'o}n con las escalas de ajuste emocional de instrumentos fuertemente implantados. Psychometric properties of an emotional adjustment computerized adaptive test. The present work describes the psychometric properties of a computerized adaptive test for measuring emotional adjustment. An examination of the item response theory (IRT) research literature indicates that IRT has mainly been used for assessing achievement and ability rather than personality factors. Nevertheless, recent years have produced several studies that successfully applied IRT to personality assessment instruments. Even so, few studies have examined the characteristics of an IRT-based computerized adaptive test for measuring a personality trait such as emotional adjustment. Our results show the efficiency of the CAT for assessing emotional adjustment: it provides a valid and precise measurement while using fewer items than the emotional adjustment scales of well-established questionnaires.}, keywords = {Computer Assisted Testing, Emotional Adjustment, Item Response Theory, Personality Measures, Psychometrics, Test Validity}, author = {Aguado, D. and Rubio, V. J. and Hontangas, P. M. and Hern{\'a}ndez, J. M.} } @article {282, title = {Somministrazione di test computerizzati di tipo adattivo: Un{\textquoteright}applicazione del modello di misurazione di Rasch [Administration of computerized and adaptive tests: An application of the Rasch Model]}, journal = {Testing Psicometria Metodologia}, volume = {12}, number = {3}, year = {2005}, pages = {131-149}, abstract = {The aim of the present study is to describe the characteristics of a procedure for administering computerized adaptive tests (Computer Adaptive Testing or CAT). The items presented to each individual are chosen interactively, selected from a "bank" in which they were previously calibrated and recorded on the basis of their difficulty level. Item selection is driven by increasingly accurate estimates of the examinees{\textquoteright} ability. The construction of an item bank on psychometrics and the implementation of this procedure allow a first validation through Monte Carlo simulations. (PsycINFO Database Record (c) 2006 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Response Theory, computerized adaptive testing, Models, Psychometrics}, author = {Miceli, R. and Molinengo, G.} } @article {195, title = {Test construction for cognitive diagnosis}, journal = {Applied Psychological Measurement}, volume = {29}, number = {4}, year = {2005}, pages = {262-277}, abstract = {Although cognitive diagnostic models (CDMs) can be useful in the analysis and interpretation of existing tests, little has been developed to specify how one might construct a good test using aspects of the CDMs. This article discusses the derivation of a general CDM index based on Kullback-Leibler information that will serve as a measure of how informative an item is for the classification of examinees. The effectiveness of the index is examined for items calibrated using the deterministic input noisy "and" gate model (DINA) and the reparameterized unified model (RUM) by implementing a simple heuristic to construct a test from an item bank. 
When compared to randomly constructed tests from the same item bank, the heuristic shows significant improvement in classification rates. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Cognitive Assessment, Item Analysis (Statistical), Profiles (Measurement), Test Construction, Test Interpretation, Test Items}, author = {Henson, R. K. and Douglas, J.} } @inbook {322, title = {Adaptive computerized educational systems: A case study}, booktitle = {Evidence-based educational methods}, series = {Educational Psychology Series}, year = {2004}, note = {Evidence-based educational methods. A volume in the educational psychology series. (pp. 143-170). San Diego, CA: Elsevier Academic Press, [URL:http://www.academicpress.com]. xxiv, 382 pp}, pages = {143-169}, publisher = {Elsevier Academic Press}, organization = {Elsevier Academic Press}, chapter = {10}, address = {San Diego, CA, USA}, abstract = {(Created by APA) Adaptive instruction describes adjustments typical of one-on-one tutoring as discussed in the college tutorial scenario. Computerized adaptive instruction thus refers to the use of computer software--almost always incorporating artificially intelligent services--which has been designed to adjust both the presentation of information and the form of questioning to meet the current needs of an individual learner. This chapter describes a system for Internet-delivered adaptive instruction. The author attempts to demonstrate a sharp difference between the teaching that takes place outside the classroom in universities and the kind that is at least afforded to, if not taken advantage of by, many students in more personalized educational settings such as small liberal arts colleges. The author describes a computer-based technology that allows that gap to be bridged, with the advantage of at least having more highly prepared learners sitting in college classrooms. A limited range of emerging research that supports that proposition is cited. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Artificial Intelligence, Computer Assisted Instruction, Computer Software, Higher Education, Individualized Instruction, Internet, Undergraduate Education}, author = {Ray, R. D.}, editor = {R. W. Malott} } @article {147, title = {Computerized adaptive measurement of depression: A simulation study}, journal = {BMC Psychiatry}, volume = {4}, number = {1}, year = {2004}, pages = {13-23}, abstract = {Background: Efficient, accurate instruments for measuring depression are increasingly important in clinical practice. We developed a computerized adaptive version of the Beck Depression Inventory (BDI). We examined its efficiency and its usefulness in identifying Major Depressive Episodes (MDE) and in measuring depression severity. Methods: Subjects were 744 participants in research studies in which each subject completed both the BDI and the SCID. In addition, 285 patients completed the Hamilton Depression Rating Scale. Results: The adaptive BDI had an AUC as an indicator of a SCID diagnosis of MDE of 88\%, equivalent to the full BDI. The adaptive BDI asked fewer questions than the full BDI (5.6 versus 21 items). 
The adaptive latent depression score correlated r = .92 with the BDI total score, and the latent depression score correlated more highly with the Hamilton (r = .74) than the BDI total score did (r = .70). Conclusions: Adaptive testing for depression may provide greatly increased efficiency without loss of accuracy in identifying MDE or in measuring depression severity.}, keywords = {*Computer Simulation, Adult, Algorithms, Area Under Curve, Comparative Study, Depressive Disorder/*diagnosis/epidemiology/psychology, Diagnosis, Computer-Assisted/*methods/statistics \& numerical data, Factor Analysis, Statistical, Female, Humans, Internet, Male, Mass Screening/methods, Patient Selection, Personality Inventory/*statistics \& numerical data, Pilot Projects, Prevalence, Psychiatric Status Rating Scales/*statistics \& numerical data, Psychometrics, Research Support, Non-U.S. Gov{\textquoteright}t, Research Support, U.S. Gov{\textquoteright}t, P.H.S., Severity of Illness Index, Software}, author = {Gardner, W. and Shear, K. and Kelleher, K. J. and Pajer, K. A. and Mammen, O. and Buysse, D. and Frank, E.} } @article {44, title = {Computers in clinical assessment: Historical developments, present status, and future challenges}, journal = {Journal of Clinical Psychology}, volume = {60}, number = {3}, year = {2004}, pages = {331-345}, publisher = {John Wiley \& Sons: US}, abstract = {Computerized testing methods have long been regarded as a potentially powerful asset for providing psychological assessment services. Ever since computers were first introduced and adapted to the field of assessment psychology in the 1950s, they have been a valuable aid for scoring, data processing, and even interpretation of test results. The history and status of computer-based personality and neuropsychological tests are discussed in this article. Several pertinent issues involved in providing test interpretation by computer are highlighted. Advances in computer-based test use, such as computerized adaptive testing, are described and problems noted. Today, there is great interest in expanding the availability of psychological assessment applications on the Internet. Although these applications show great promise, there are a number of problems associated with providing psychological tests on the Internet that need to be addressed by psychologists before the Internet can become a major medium for psychological service delivery. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {clinical assessment, computerized testing method, Internet, psychological assessment services}, isbn = {0021-9762 (Print); 1097-4679 (Electronic)}, author = {Butcher, J. N. and Perry, J. L. and Hahn, J. A.} } @article {408, title = {Constraining item exposure in computerized adaptive testing with shadow tests}, journal = {Journal of Educational and Behavioral Statistics}, volume = {29}, number = {3}, year = {2004}, pages = {273-291}, publisher = {American Educational Research Assn: US}, abstract = {Item-exposure control in computerized adaptive testing is implemented by imposing item-ineligibility constraints on the assembly process of the shadow tests. The method resembles Sympson and Hetter{\textquoteright}s (1985) method of item-exposure control in that the decisions to impose the constraints are probabilistic. The method does not, however, require time-consuming simulation studies to set values for control parameters before the operational use of the test. 
Instead, it can set the probabilities of item ineligibility adaptively during the test using the actual item-exposure rates. An empirical study using an item pool from the Law School Admission Test showed that application of the method yielded perfect control of the item-exposure rates and had negligible impact on the bias and mean-squared error functions of the ability estimator. }, keywords = {computerized adaptive testing, item exposure control, item ineligibility constraints, Probability, shadow tests}, isbn = {1076-9986 (Print)}, author = {van der Linden, W. J. and Veldkamp, B. P.} } @article {10, title = {Constructing rotating item pools for constrained adaptive testing}, journal = {Journal of Educational Measurement}, volume = {41}, number = {4}, year = {2004}, pages = {345-359}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Preventing items in adaptive testing from being over- or underexposed is one of the main problems in computerized adaptive testing. Though the problem of overexposed items can be solved using a probabilistic item-exposure control method, such methods are unable to deal with the problem of underexposed items. Using a system of rotating item pools, on the other hand, is a method that potentially solves both problems. In this method, a master pool is divided into (possibly overlapping) smaller item pools, which are required to have similar distributions of content and statistical attributes. These pools are rotated among the testing sites to realize desirable exposure rates for the items. A test assembly model, motivated by Gulliksen{\textquoteright}s matched random subtests method, was explored to help solve the problem of dividing a master pool into a set of smaller pools. Different methods to solve the model are proposed. An item pool from the Law School Admission Test was used to evaluate the performances of computerized adaptive tests from systems of rotating item pools constructed using these methods. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive tests, constrained adaptive testing, item exposure, rotating item pools}, isbn = {0022-0655 (Print)}, author = {Ariel, A. and Veldkamp, B. P. and van der Linden, W. J.} } @article {69, title = {Effects of practical constraints on item selection rules at the early stages of computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {41}, number = {2}, year = {2004}, pages = {149-174}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {The purpose of this study was to compare the effects of four item selection rules--(1) Fisher information (F), (2) Fisher information with a posterior distribution (FP), (3) Kullback-Leibler information with a posterior distribution (KP), and (4) completely randomized item selection (RN)--with respect to the precision of trait estimation and the extent of item usage at the early stages of computerized adaptive testing. The comparison of the four item selection rules was carried out under three conditions: (1) using only the item information function as the item selection criterion; (2) using both the item information function and content balancing; and (3) using the item information function, content balancing, and item exposure control. When test length was less than 10 items, FP and KP tended to outperform F at extreme trait levels in Condition 1. 
However, in more realistic settings, it could not be concluded that FP and KP outperformed F, especially when item exposure control was imposed. When test length was greater than 10 items, the three nonrandom item selection procedures performed similarly no matter what the condition was, while F had slightly higher item usage. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, item selection rules, practical constraints}, isbn = {0022-0655 (Print)}, author = {Chen, S-Y. and Ankenmann, R. D.} } @article {332, title = {Estimating ability and item-selection strategy in self-adapted testing: A latent class approach}, journal = {Journal of Educational and Behavioral Statistics}, volume = {29}, number = {4}, year = {2004}, pages = {379-396}, publisher = {American Educational Research Assn: US}, abstract = {This article presents a psychometric model for estimating ability and item-selection strategies in self-adapted testing. In contrast to computer adaptive testing, in self-adapted testing the examinees are allowed to select the difficulty of the items. The item-selection strategy is defined as the distribution of difficulty conditional on the responses given to previous items. The article shows that missing responses in self-adapted testing are missing at random and can be ignored in the estimation of ability. However, the item-selection strategy cannot always be ignored in such an estimation. An EM algorithm is presented to estimate an examinee{\textquoteright}s ability and strategies, and a model fit is evaluated using Akaike{\textquoteright}s information criterion. The article includes an application with real data to illustrate how the model can be used in practice for evaluating hypotheses, estimating ability, and identifying strategies. In the example, four strategies were identified and related to examinees{\textquoteright} ability. It was shown that individual examinees tended not to follow a consistent strategy throughout the test. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {estimating ability, item-selection strategies, psychometric model, self-adapted testing}, isbn = {1076-9986 (Print)}, author = {Revuelta, J.} } @article {93, title = {Strategies for controlling item exposure in computerized adaptive testing with the generalized partial credit model}, journal = {Applied Psychological Measurement}, volume = {28}, number = {3}, year = {2004}, pages = {165-185}, publisher = {Sage Publications: US}, abstract = {Choosing a strategy for controlling item exposure has become an integral part of test development for computerized adaptive testing (CAT). This study investigated the performance of six procedures for controlling item exposure in a series of simulated CATs under the generalized partial credit model. In addition to a no-exposure control baseline condition, the randomesque, modified-within-.10-logits, Sympson-Hetter, conditional Sympson-Hetter, a-stratified with multiple-stratification, and enhanced a-stratified with multiple-stratification procedures were implemented to control exposure rates. Two variations of the randomesque and modified-within-.10-logits procedures were examined, which varied the size of the item group from which the next item to be administered was randomly selected. 
The results indicate that although the conditional Sympson-Hetter provides somewhat lower maximum exposure rates, the randomesque and modified-within-.10-logits procedures with the six-item group variation have great utility for controlling overlap rates and increasing pool utilization and should be given further consideration. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, generalized partial credit model, item exposure}, isbn = {0146-6216 (Print)}, author = {Davis, L. L.} } @article {278, title = {Using patterns of summed scores in paper-and-pencil tests and computer-adaptive tests to detect misfitting item score patterns}, journal = {Journal of Educational Measurement}, volume = {41}, number = {2}, year = {2004}, pages = {119-136}, abstract = {Two new methods have been proposed to determine unexpected sum scores on subtests (testlets) both for paper-and-pencil tests and computer adaptive tests. A method based on a conservative bound using the hypergeometric distribution, denoted ρ, was compared with a method where the probability for each score combination was calculated using a highest density region (HDR). Furthermore, these methods were compared with the standardized log-likelihood statistic with and without a correction for the estimated latent trait value (denoted as $l^*_z$ and $l_z$, respectively). Data were simulated on the basis of the one-parameter logistic model, and both parametric and nonparametric logistic regression were used to obtain estimates of the latent trait. Results showed that it is important to take the trait level into account when comparing subtest scores. In a nonparametric item response theory (IRT) context, an adapted version of the HDR method was a powerful alternative to ρ. In a parametric IRT context, results showed that $l^*_z$ had the highest power when the data were simulated conditionally on the estimated latent trait level. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Computer Assisted Testing, Item Response Theory, person Fit, Test Scores}, author = {Meijer, R. R.} } @article {275, title = {A Bayesian method for the detection of item preknowledge in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {27}, number = {2}, year = {2003}, pages = {121-137}, abstract = {With the increased use of continuous testing in computerized adaptive testing, new concerns about test security have evolved, such as how to ensure that items in an item pool are safeguarded from theft. In this article, procedures to detect test takers using item preknowledge are explored. When test takers use item preknowledge, their item responses deviate from the underlying item response theory (IRT) model, and estimated abilities may be inflated. This deviation may be detected through the use of person-fit indices. A Bayesian posterior log odds ratio index is proposed for detecting the use of item preknowledge. In this approach to person fit, the estimated probability that each test taker has preknowledge of items is updated after each item response. These probabilities are based on the IRT parameters, a model specifying the probability that each item has been memorized, and the test taker{\textquoteright}s item responses. Simulations based on an operational computerized adaptive test (CAT) pool are used to demonstrate the use of the odds ratio index. 
(PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Cheating, Computer Assisted Testing, Individual Differences computerized adaptive testing, Item, Item Analysis (Statistical), Mathematical Modeling, Response Theory}, author = {McLeod, L. and Lewis, C. and Thissen, D.} } @article {63, title = {A comparative study of item exposure control methods in computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {40}, number = {1}, year = {2003}, pages = {71-103}, abstract = {This study compared the properties of five methods of item exposure control within the purview of estimating examinees{\textquoteright} abilities in a computerized adaptive testing (CAT) context. Each exposure control algorithm was incorporated into the item selection procedure and the adaptive testing progressed based on the CAT design established for this study. The merits and shortcomings of these strategies were considered under different item pool sizes and different desired maximum exposure rates and were evaluated in light of the observed maximum exposure rates, the test overlap rates, and the conditional standard errors of measurement. Each method had its advantages and disadvantages, but no one possessed all of the desired characteristics. There was a clear and logical trade-off between item exposure control and measurement precision. The M. L. Stocking and C. Lewis conditional multinomial procedure and, to a slightly lesser extent, the T. Davey and C. G. Parshall method seemed to be the most promising considering all of the factors that this study addressed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Educational, Item Analysis (Statistical), Measurement, Strategies computerized adaptive testing}, author = {Chang, S-W. and Ansley, T. N.} } @article {349, title = {Computerized adaptive rating scales for measuring managerial performance}, journal = {International Journal of Selection and Assessment}, volume = {11}, number = {2-3}, year = {2003}, pages = {237-246}, abstract = {Computerized adaptive rating scales (CARS) had been developed to measure contextual or citizenship performance. This rating format used a paired-comparison protocol, presenting pairs of behavioral statements scaled according to effectiveness levels, and an iterative item response theory algorithm to obtain estimates of ratees{\textquoteright} citizenship performance (W. C. Borman et al, 2001). In the present research, we developed CARS to measure the entire managerial performance domain, including task and citizenship performance, thus addressing a major limitation of the earlier CARS. The paper describes this development effort, including an adjustment to the algorithm that reduces substantially the number of item pairs required to obtain almost as much precision in the performance estimates. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Algorithms, Associations, Citizenship, Computer Assisted Testing, Construction, Contextual, Item Response Theory, Job Performance, Management, Management Personnel, Rating Scales, Test}, author = {Schneider, R. J. and Goff, M. and Anderson, S. and Borman, W. 
C.} } @article {75, title = {Computerized adaptive testing using the nearest-neighbors criterion}, journal = {Applied Psychological Measurement}, volume = {27}, number = {3}, year = {2003}, pages = {204-216}, abstract = {Item selection procedures designed for computerized adaptive testing need to accurately estimate every taker{\textquoteright}s trait level (θ) and, at the same time, effectively use all items in a bank. Empirical studies showed that classical item selection procedures based on maximizing Fisher or other related information yielded highly varied item exposure rates; with these procedures, some items were frequently used whereas others were rarely selected. In the literature, methods have been proposed for controlling exposure rates; they tend to affect the accuracy in θ estimates, however. A modified version of the maximum Fisher information (MFI) criterion, coined the nearest neighbors (NN) criterion, is proposed in this study. The NN procedure improves to a moderate extent the undesirable item exposure rates associated with the MFI criterion and keeps sufficient precision in estimates. The NN criterion will be compared with a few other existing methods in an empirical study using the mean squared errors in θ estimates and plots of item exposure rates associated with different distributions. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {(Statistical), Adaptive Testing, Computer Assisted Testing, Item Analysis, Item Response Theory, Statistical Analysis, Statistical Estimation computerized adaptive testing, Statistical Tests}, author = {Cheng, P. E. and Liou, M.} } @article {94, title = {Item exposure constraints for testlets in the verbal reasoning section of the MCAT}, journal = {Applied Psychological Measurement}, volume = {27}, number = {5}, year = {2003}, pages = {335-356}, abstract = {The current study examined item exposure control procedures for testlet scored reading passages in the Verbal Reasoning section of the Medical College Admission Test with four computerized adaptive testing (CAT) systems using the partial credit model. The first system used a traditional CAT using maximum information item selection. The second used random item selection to provide a baseline for optimal exposure rates. The third used a variation of Lunz and Stahl{\textquoteright}s randomization procedure. The fourth used Luecht and Nungester{\textquoteright}s computerized adaptive sequential testing (CAST) system. A series of simulated fixed-length CATs was run to determine the optimal item length selection procedure. Results indicated that both the randomization procedure and CAST performed well in terms of exposure control and measurement precision, with the CAST system providing the best overall solution when all variables were taken into consideration. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Entrance Examinations, Item Response Theory, Random Sampling, Reasoning, Verbal Ability computerized adaptive testing}, author = {Davis, L. L. and Dodd, B. G.} } @article {57, title = {Optimal stratification of item pools in α-stratified computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {27}, number = {4}, year = {2003}, pages = {262-274}, abstract = {A method based on 0-1 linear programming (LP) is presented to stratify an item pool optimally for use in α-stratified adaptive testing. 
Because the 0-1 LP model belongs to the subclass of models with a network flow structure, efficient solutions are possible. The method is applied to a previous item pool from the computerized adaptive testing (CAT) version of the Graduate Record Exams (GRE) Quantitative Test. The results indicate that the new method performs well in practical situations. It improves item exposure control, reduces the mean squared error in the θ estimates, and increases test reliability. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Content (Test), Item Response Theory, Mathematical Modeling, Test Construction computerized adaptive testing}, author = {Chang, Hua-Hua and van der Linden, W. J.} } @article {68, title = {The relationship between item exposure and test overlap in computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {40}, number = {2}, year = {2003}, pages = {129-145}, abstract = {The purpose of this article is to present an analytical derivation for the mathematical form of an average between-test overlap index as a function of the item exposure index, for fixed-length computerized adaptive tests (CATs). This algebraic relationship is used to investigate the simultaneous control of item exposure at both the item and test levels. The results indicate that, in fixed-length CATs, control of the average between-test overlap is achieved via the mean and variance of the item exposure rates of the items that constitute the CAT item pool. The mean of the item exposure rates is easily manipulated. Control over the variance of the item exposure rates can be achieved via the maximum item exposure rate (r-sub(max)). Therefore, item exposure control methods which implement a specification of r-sub(max) (e.g., J. B. Sympson and R. D. Hetter, 1985) provide the most direct control at both the item and test levels. (PsycINFO Database Record (c) 2005 APA )}, keywords = {(Statistical), Adaptive Testing, Computer Assisted Testing, Human Computer, Interaction computerized adaptive testing, Item Analysis, Item Analysis (Test), Test Items}, author = {Chen, S-Y. and Ankemann, R. D. and Spray, J. A.} } @article {321, title = {Timing behavior in computerized adaptive testing: Response times for correct and incorrect answers are not related to general fluid intelligence/Zum Zeitverhalten beim computergest{\"u}tzten adaptiveb Testen: Antwortlatenzen bei richtigen und falschen L{\"o}sun}, journal = {Zeitschrift f{\"u}r Differentielle und Diagnostische Psychologie}, volume = {24}, number = {1}, year = {2003}, pages = {57-63}, abstract = {Examined the effects of general fluid intelligence on item response times for correct and false responses in computerized adaptive testing. After performing the CFT3 intelligence test, 80 individuals (aged 17-44 yrs) completed perceptual and cognitive discrimination tasks. Results show that response times were related neither to the proficiency dimension reflected by the task nor to the individual level of fluid intelligence. Furthermore, the false > correct-phenomenon as well as substantial positive correlations between item response times for false and correct responses were shown to be independent of intelligence levels. 
(PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Cognitive Ability, Intelligence, Perception, Reaction Time computerized adaptive testing}, author = {Rammsayer, Thomas and Brandler, Susanne} } @article {308, title = {A comparison of item selection techniques and exposure control mechanisms in CATs using the generalized partial credit model}, journal = {Applied Psychological Measurement}, volume = {26}, number = {2}, year = {2002}, pages = {147-163}, abstract = {The use of more performance items in large-scale testing has led to an increase in the research investigating the use of polytomously scored items in computer adaptive testing (CAT). Because this research has to be complemented with information pertaining to exposure control, the present research investigated the impact of using five different exposure control algorithms in two sized item pools calibrated using the generalized partial credit model. The results of the simulation study indicated that the a-stratified design, in comparison to a no-exposure control condition, could be used to reduce item exposure and overlap, increase pool utilization, and only minorly degrade measurement precision. Use of the more restrictive exposure control algorithms, such as the Sympson-Hetter and conditional Sympson-Hetter, controlled exposure to a greater extent but at the cost of measurement precision. Because convergence of the exposure control parameters was problematic for some of the more restrictive exposure control algorithms, use of the more simplistic exposure control mechanisms, particularly when the test length to item pool size ratio is large, is recommended. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {(Statistical), Adaptive Testing, Algorithms computerized adaptive testing, Computer Assisted Testing, Item Analysis, Item Response Theory, Mathematical Modeling}, author = {Pastor, D. A. and Dodd, B. G. and Chang, Hua-Hua} } @article {14, title = {Data sparseness and on-line pretest item calibration-scaling methods in CAT}, journal = {Journal of Educational Measurement}, volume = {39}, number = {3}, year = {2002}, pages = {207-218}, abstract = {Compared and evaluated 3 on-line pretest item calibration-scaling methods (the marginal maximum likelihood estimate with 1 expectation maximization [EM] cycle [OEM] method, the marginal maximum likelihood estimate with multiple EM cycles [MEM] method, and M. L. Stocking{\textquoteright}s Method B) in terms of item parameter recovery when the item responses to the pretest items in the pool are sparse. Simulations of computerized adaptive tests were used to evaluate the results yielded by the three methods. The MEM method produced the smallest average total error in parameter estimation, and the OEM method yielded the largest total error (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, Educational Measurement, Item Response Theory, Maximum Likelihood, Methodology, Scaling (Testing), Statistical Data}, author = {Ban, J-C. and Hanson, B. A. and Yi, Q. and Harris, D. J.} } @article {370, title = {An EM approach to parameter estimation for the Zinnes and Griggs paired comparison IRT model}, journal = {Applied Psychological Measurement}, volume = {26}, number = {2}, year = {2002}, pages = {208-227}, abstract = {Borman et al. recently proposed a computer adaptive performance appraisal system called CARS II that utilizes paired comparison judgments of behavioral stimuli. 
To implement this approach,the paired comparison ideal point model developed by Zinnes and Griggs was selected. In this article,the authors describe item response and information functions for the Zinnes and Griggs model and present procedures for estimating stimulus and person parameters. Monte carlo simulations were conducted to assess the accuracy of the parameter estimation procedures. The results indicated that at least 400 ratees (i.e.,ratings) are required to obtain reasonably accurate estimates of the stimulus parameters and their standard errors. In addition,latent trait estimation improves as test length increases. The implications of these results for test construction are also discussed. }, keywords = {Adaptive Testing, Computer Assisted Testing, Item Response Theory, Maximum Likelihood, Personnel Evaluation, Statistical Correlation, Statistical Estimation}, author = {Stark, S. and F Drasgow} } @article {12, title = {Information technology and literacy assessment}, journal = {Reading and Writing Quarterly}, volume = {18}, number = {4}, year = {2002}, pages = {369-373}, abstract = {This column discusses information technology and literacy assessment in the past and present. The author also describes computer-based assessments today including the following topics: computer-scored testing, computer-administered formal assessment, Internet formal assessment, computerized adaptive tests, placement tests, informal assessment, electronic portfolios, information management, and Internet information dissemination. A model of the major present-day applications of information technologies in reading and literacy assessment is also included. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Applications, Computer Assisted Testing, Information, Internet, Literacy, Models, Systems, Technology}, author = {Balajthy, E.} } @article {418, title = {Mathematical-programming approaches to test item pool design}, number = {RR 02-09}, year = {2002}, note = {Using Smart Source ParsingAdvances in psychology research, Vol. ( Hauppauge, NY : Nova Science Publishers, Inc, [URL:http://www.Novapublishers.com]. vi, 228 pp}, pages = {93-108}, institution = {University of Twente, Faculty of Educational Science and Technology}, address = {Twente, The Netherlands}, abstract = {(From the chapter) This paper presents an approach to item pool design that has the potential to improve on the quality of current item pools in educational and psychological testing and hence to increase both measurement precision and validity. The approach consists of the application of mathematical programming techniques to calculate optimal blueprints for item pools. These blueprints can be used to guide the item-writing process. Three different types of design problems are discussed, namely for item pools for linear tests, item pools computerized adaptive testing (CAT), and systems of rotating item pools for CAT. The paper concludes with an empirical example of the problem of designing a system of rotating item pools for CAT.}, keywords = {Adaptive Testing, Computer Assisted, Computer Programming, Educational Measurement, Item Response Theory, Mathematics, Psychometrics, Statistical Rotation computerized adaptive testing, Test Items, Testing}, isbn = {02-09}, author = {Veldkamp, B. P. and van der Linden, W. J. 
and Ariel, A.} } @article {277, title = {Outlier detection in high-stakes certification testing}, journal = {Journal of Educational Measurement}, volume = {39}, number = {3}, year = {2002}, pages = {219-233}, abstract = {Discusses recent developments of person-fit analysis in computerized adaptive testing (CAT). Methods from statistical process control are presented that have been proposed to classify an item score pattern as fitting or misfitting the underlying item response theory model in CAT Most person-fit research in CAT is restricted to simulated data. In this study, empirical data from a certification test were used. Alternatives are discussed to generate norms so that bounds can be determined to classify an item score pattern as fitting or misfitting. Using bounds determined from a sample of a high-stakes certification test, the empirical analysis showed that different types of misfit can be distinguished Further applications using statistical process control methods to detect misfitting item score patterns are discussed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, computerized adaptive testing, Educational Measurement, Goodness of Fit, Item Analysis (Statistical), Item Response Theory, person Fit, Statistical Estimation, Statistical Power, Test Scores}, author = {Meijer, R. R.} } @article {346, title = {A structure-based approach to psychological measurement: Matching measurement models to latent structure}, journal = {Assessment}, volume = {9}, number = {1}, year = {2002}, pages = {4-16}, abstract = {The present article sets forth the argument that psychological assessment should be based on a construct{\textquoteright}s latent structure. The authors differentiate dimensional (continuous) and taxonic (categorical) structures at the latent and manifest levels and describe the advantages of matching the assessment approach to the latent structure of a construct. A proper match will decrease measurement error, increase statistical power, clarify statistical relationships, and facilitate the location of an efficient cutting score when applicable. Thus, individuals will be placed along a continuum or assigned to classes more accurately. The authors briefly review the methods by which latent structure can be determined and outline a structure-based approach to assessment that builds on dimensional scaling models, such as item response theory, while incorporating classification methods as appropriate. Finally, the authors empirically demonstrate the utility of their approach and discuss its compatibility with traditional assessment methods and with computerized adaptive testing. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Assessment, Classification (Cognitive Process), Computer Assisted, Item Response Theory, Psychological, Scaling (Testing), Statistical Analysis computerized adaptive testing, Taxonomies, Testing}, author = {Ruscio, John and Ruscio, Ayelet Meron} } @article {336, title = {Computerized adaptive testing with the generalized graded unfolding model}, journal = {Applied Psychological Measurement}, volume = {25}, number = {2}, year = {2001}, pages = {177-196}, abstract = {Examined the use of the generalized graded unfolding model (GGUM) in computerized adaptive testing. The objective was to minimize the number of items required to produce equiprecise estimates of person locations. 
Simulations based on real data about college student attitudes toward abortion and on data generated to fit the GGUM were used. It was found that as few as 7 or 8 items were needed to produce accurate and precise person estimates using an expected a posteriori procedure. The number items in the item bank (20, 40, or 60 items) and their distribution on the continuum (uniform locations or item clusters in moderately extreme locations) had only small effects on the accuracy and precision of the estimates. These results suggest that adaptive testing with the GGUM is a good method for achieving estimates with an approximately uniform level of precision using a small number of items. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Attitude Measurement, College Students computerized adaptive testing, Computer Assisted Testing, Item Response, Models, Statistical Estimation, Theory}, author = {Roberts, J. S. and Lin, Y. and Laughlin, J. E.} } @article {358, title = {Developments in measurement of persons and items by means of item response models}, journal = {Behaviormetrika}, volume = {28}, number = {1}, year = {2001}, pages = {65-94}, abstract = {This paper starts with a general introduction into measurement of hypothetical constructs typical of the social and behavioral sciences. After the stages ranging from theory through operationalization and item domain to preliminary test or questionnaire have been treated, the general assumptions of item response theory are discussed. The family of parametric item response models for dichotomous items is introduced and it is explained how parameters for respondents and items are estimated from the scores collected from a sample of respondents who took the test or questionnaire. Next, the family of nonparametric item response models is explained, followed by the 3 classes of item response models for polytomous item scores (e.g., rating scale scores). Then, to what degree the mean item score and the unweighted sum of item scores for persons are useful for measuring items and persons in the context of item response theory is discussed. Methods for fitting parametric and nonparametric models to data are briefly discussed. Finally, the main applications of item response models are discussed, which include equating and item banking, computerized and adaptive testing, research into differential item functioning, person fit research, and cognitive modeling. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Cognitive, Computer Assisted Testing, Item Response Theory, Models, Nonparametric Statistical Tests, Processes}, author = {Sijtsma, K.} } @inbook {385, title = {Item response theory applied to combinations of multiple-choice and constructed-response items--approximation methods for scale scores}, booktitle = {Test scoring}, year = {2001}, note = {Using Smart Source ParsingTest scoring. (pp. 293-341). Mahwah, NJ : Lawrence Erlbaum Associates, Publishers. xii, 422 pp}, pages = {289-315}, publisher = {Lawrence Erlbaum Associates}, organization = {Lawrence Erlbaum Associates}, chapter = {8}, address = {Mahwah, N.J. USA}, abstract = {(From the chapter) The authors develop approximate methods that replace the scoring tables with weighted linear combinations of the component scores. 
Topics discussed include: a linear approximation for the extension to combinations of scores; the generalization of two or more scores; potential applications of linear approximations to item response theory in computerized adaptive tests; and evaluation of the pattern-of-summed-scores, and Gaussian approximation, estimates of proficiency. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Item Response Theory, Method), Multiple Choice (Testing, Scoring (Testing), Statistical Estimation, Statistical Weighting, Test Items, Test Scores}, author = {Thissen, D. and Nelson, L. A. and Swygert, K. A.} } @article {70, title = {A comparison of item selection rules at the early stages of computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {24}, number = {3}, year = {2000}, pages = {241-255}, abstract = {The effects of 5 item selection rules--Fisher information (FI), Fisher interval information (FII), Fisher information with a posterior distribution (FIP), Kullback-Leibler information (KL), and Kullback-Leibler information with a posterior distribution (KLP)--were compared with respect to the efficiency and precision of trait (θ) estimation at the early stages of computerized adaptive testing (CAT). FII, FIP, KL, and KLP performed marginally better than FI at the early stages of CAT for θ=-3 and -2. For tests longer than 10 items, there appeared to be no precision advantage for any of the selection rules. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Analysis (Test), Statistical Estimation computerized adaptive testing}, author = {Chen, S-Y. and Ankenmann, R. D. and Chang, Hua-Hua} } @article {179, title = {Emergence of item response modeling in instrument development and data analysis}, journal = {Medical Care}, volume = {38}, number = {Suppl. 9}, year = {2000}, pages = {II60-II65}, keywords = {Computer Assisted Testing, Health, Item Response Theory, Measurement, Statistical Validity computerized adaptive testing, Test Construction, Treatment Outcomes}, author = {Hambleton, R. K.} } @article {74, title = {Estimation of trait level in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {24}, number = {3}, year = {2000}, pages = {257-265}, abstract = {Notes that in computerized adaptive testing (CAT), a examinee{\textquoteright}s trait level (θ) must be estimated with reasonable accuracy based on a small number of item responses. A successful implementation of CAT depends on (1) the accuracy of statistical methods used for estimating θ and (2) the efficiency of the item-selection criterion. Methods of estimating θ suitable for CAT are reviewed, and the differences between Fisher and Kullback-Leibler information criteria for selecting items are discussed. The accuracy of different CAT algorithms was examined in an empirical study. The results show that correcting θ estimates for bias was necessary at earlier stages of CAT, but most CAT algorithms performed equally well for tests of 10 or more items. (PsycINFO Database Record (c) 2005 APA )}, keywords = {(Statistical), Adaptive Testing, Computer Assisted Testing, Item Analysis, Statistical Estimation computerized adaptive testing}, author = {Cheng, P. E. 
and Liou, M.} } @article {364, title = {An exploratory analysis of item parameters and characteristics that influence item level response time}, journal = {Dissertation Abstracts International Section A: Humanities and Social Sciences}, volume = {61}, number = {5-A}, year = {2000}, pages = {1812}, abstract = {This research examines the relationship between item level response time and (1) item discrimination, (2) item difficulty, (3) word count, (4) item type, and (5) whether a figure is included in an item. Data are from the Graduate Management Admission Test, which is currently offered only as a computerized adaptive test. Analyses revealed significant differences in response time between the five item types: problem solving, data sufficiency, sentence correction, critical reasoning, and reading comprehension. For this reason, the planned pairwise and complex analyses were run within each item type. Pairwise curvilinear regression analyses explored the relationship between response time and item discrimination, item difficulty, and word count. Item difficulty significantly contributed to the prediction of response time for each item type; two of the relationships were significantly quadratic. Item discrimination significantly contributed to the prediction of response time for only two of the item types; one revealed a quadratic relationship and the other a cubic relationship. Word count had significant linear relationship with response time for all the item types except reading comprehension, for which there was no significant relationship. Multiple regression analyses using word count, item difficulty, and item discrimination predicted between 35.4\% and 71.4\% of the variability in item response time across item types. The results suggest that response time research should consider the type of item that is being administered and continue to explore curvilinear relationships between response time and its predictor variables. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Item Analysis (Statistical), Item Response Theory, Problem Solving, Reaction Time, Reading Comprehension, Reasoning}, author = {Smith, Russell Winsor} } @article {409, title = {An integer programming approach to item bank design}, journal = {Applied Psychological Measurement}, volume = {24}, number = {2}, year = {2000}, pages = {139-150}, abstract = {An integer programming approach to item bank design is presented that can be used to calculate an optimal blueprint for an item bank, in order to support an existing testing program. The results are optimal in that they minimize the effort involved in producing the items as revealed by current item writing patterns. Also presented is an adaptation of the models, which can be used as a set of monitoring tools in item bank management. The approach is demonstrated empirically for an item bank that was designed for the Law School Admission Test. }, keywords = {Aptitude Measures, Item Analysis (Test), Item Response Theory, Test Construction, Test Items}, author = {van der Linden, W. J. and Veldkamp, B. P. and Reese, L. M.} } @article {234, title = {Evaluating the usefulness of computerized adaptive testing for medical in-course assessment}, journal = {Academic Medicine}, volume = {74}, number = {10}, year = {1999}, note = {Kreiter, C DFerguson, KGruppen, L DUnited statesAcademic medicine : journal of the Association of American Medical CollegesAcad Med. 
1999 Oct;74(10):1125-8.}, month = {Oct}, pages = {1125-8}, edition = {1999/10/28}, abstract = {PURPOSE: This study investigated the feasibility of converting an existing computer-administered, in-course internal medicine test to an adaptive format. METHOD: A 200-item internal medicine extended matching test was used for this research. Parameters were estimated with commercially available software with responses from 621 examinees. A specially developed simulation program was used to retrospectively estimate the efficiency of the computer-adaptive exam format. RESULTS: It was found that the average test length could be shortened by almost half with measurement precision approximately equal to that of the full 200-item paper-and-pencil test. However, computer-adaptive testing with this item bank provided little advantage for examinees at the upper end of the ability continuum. An examination of classical item statistics and IRT item statistics suggested that adding more difficult items might extend the advantage to this group of examinees. CONCLUSIONS: Medical item banks presently used for incourse assessment might be advantageously employed in adaptive testing. However, it is important to evaluate the match between the items and the measurement objective of the test before implementing this format.}, keywords = {*Automation, *Education, Medical, Undergraduate, Educational Measurement/*methods, Humans, Internal Medicine/*education, Likelihood Functions, Psychometrics/*methods, Reproducibility of Results}, isbn = {1040-2446 (Print)}, author = {Kreiter, C. D. and Ferguson, K. and Gruppen, L. D.} } @conference {382, title = {A comparison of the traditional maximum information method and the global information method in CAT item selection}, booktitle = {annual meeting of the National Council on Measurement in Education}, year = {1996}, address = {New York, NY USA}, keywords = {computerized adaptive testing, item selection}, author = {Tang, K. L.} }