@article {2752, title = {An Extended Taxonomy of Variants of Computerized Adaptive Testing}, journal = {Journal of Computerized Adaptive Testing}, volume = {10}, year = {2023}, keywords = {Adaptive Testing, evidence-centered design, Item Response Theory, knowledge-based model construction, missingness}, issn = {2165-6592}, doi = {10.7333/2302-100101}, author = {Roy Levy and John T. Behrens and Robert J. Mislevy} } @article {2735, title = {Item Calibration Methods With Multiple Subscale Multistage Testing}, journal = {Journal of Educational Measurement}, volume = {57}, number = {1}, year = {2020}, pages = {3-28}, abstract = {Abstract Many large-scale educational surveys have moved from linear form design to multistage testing (MST) design. One advantage of MST is that it can provide more accurate latent trait (θ) estimates using fewer items than required by linear tests. However, MST generates incomplete response data by design; hence, questions remain as to how to calibrate items using the incomplete data from MST design. Further complication arises when there are multiple correlated subscales per test, and when items from different subscales need to be calibrated according to their respective score reporting metric. The current calibration-per-subscale method produced biased item parameters, and there is no available method for resolving the challenge. Deriving from the missing data principle, we showed when calibrating all items together the Rubin{\textquoteright}s ignorability assumption is satisfied such that the traditional single-group calibration is sufficient. When calibrating items per subscale, we proposed a simple modification to the current calibration-per-subscale method that helps reinstate the missing-at-random assumption and therefore corrects for the estimation bias that is otherwise existent. Three mainstream calibration methods are discussed in the context of MST, they are the marginal maximum likelihood estimation, the expectation maximization method, and the fixed parameter calibration. An extensive simulation study is conducted and a real data example from NAEP is analyzed to provide convincing empirical evidence.}, keywords = {EM, marginal maximum likelihood, missing data, multistage testing}, doi = {10.1111/jedm.12241}, url = {https://onlinelibrary.wiley.com/doi/abs/10.1111/jedm.12241}, author = {Wang, Chun and Chen, Ping and Jiang, Shengyu} } @conference {2653, title = {Adaptivity in a Diagnostic Educational Test}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

During the past five years a diagnostic educational test for three subjects (writing Dutch, writing English, and math) has been developed in the Netherlands. The test informs students and their teachers about the students{\textquoteright} strengths and weaknesses in such a manner that the learning process can be adjusted to their personal needs. It is a computer-based assessment for students in five different educational tracks halfway through secondary education that can yield diagnoses of many sub-skills. One of the main challenges at the outset of the development was to devise a way to deliver many diagnoses within a reasonable testing time. The answer to this challenge was to make the Diagnostic Educational Test (DET) adaptive.

In this presentation we will first discuss how the adaptivity is shaped towards the purpose of the Diagnostic Educational Test. The adaptive design, particularly the use of item blocks, will be discussed, as well as the implemented adaptive rules. We will also show a simulation of different adaptive paths of students and some empirical information on the paths students took through the test.


}, keywords = {CAT, Diagnostic tests, Education}, author = {Sanneke Schouwstra} } @conference {2652, title = {Efficiency of Targeted Multistage Calibration Designs under Practical Constraints: A Simulation Study}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Calibration of an item bank for computer adaptive testing requires substantial resources. In this study, we focused on two related research questions. First, we investigated whether the efficiency of item calibration under the Rasch model could be enhanced by calibration designs that optimize the match between item difficulty and student ability (Berger, 1991). To this end, we introduced targeted multistage calibration designs, a design type that combines traditional targeted calibration designs with multistage designs. Targeted multistage calibration designs consider ability-related background variables (e.g., grade in school) as well as performance (i.e., the outcome of a preceding test stage) when assigning students to suitable items.

Second, we explored how limited a priori knowledge about item difficulty affects the efficiency of both targeted calibration designs and targeted multistage calibration designs. When arranging items within a given calibration design, test developers need to know the item difficulties to locate items optimally within the design. However, usually, no empirical information about item difficulty is available before item calibration. Owing to missing empirical data, test developers might fail to assign all items to the most suitable location within a calibration design.

Both research questions were addressed in a simulation study in which we varied the calibration design, as well as the accuracy of item distribution across the different booklets or modules within each design (i.e., the number of misplaced items). The results indicated that targeted multistage calibration designs were more efficient than ordinary targeted designs under optimal conditions. In particular, targeted multistage calibration designs provided more accurate estimates for very easy and very difficult items. Limited knowledge about item difficulty during test construction impaired the efficiency of all designs. The loss of efficiency was considerable for one of the two investigated targeted multistage calibration designs, whereas targeted designs were more robust.

References

Berger, M. P. F. (1991). On the efficiency of IRT models when applied to different sampling designs. Applied Psychological Measurement, 15(3), 293{\textendash}306. doi:10.1177/014662169101500310


}, keywords = {CAT, Efficiency, Multistage Calibration}, url = {https://drive.google.com/file/d/1ko2LuiARKqsjL_6aupO4Pj9zgk6p_xhd/view?usp=sharing}, author = {Stephanie Berger and Angela J. Verschoor and Theo Eggen and Urs Moser} } @conference {2669, title = {FastCAT {\textendash} Customizing CAT Administration Rules to Increase Response Efficiency}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

A typical prerequisite for CAT administration is the existence of an underlying item bank completely covering the range of the trait being measured. When a bank fails to cover the full range of the trait, examinees who are close to the floor or ceiling will often never reach a standard error cut-off and will be forced to answer items increasingly less relevant to their trait level. This scenario is fairly typical for many patients responding to patient-reported outcome measures (PROMs). For example, in the assessment of physical functioning, many item banks have their ceiling at about the 50th percentile. For most healthy patients, after a few items the only items remaining in the bank will represent decreasing ability (even though the patient has already indicated that they are at or above the mean for the population). Another example would be a patient with no pain taking a Pain CAT: they will probably answer {\textquotedblleft}Never{\textquotedblright} to every succeeding item out to the maximum test length. For this project we sought to reduce patient burden, while maintaining test accuracy, by reducing CAT length using novel stopping rules.

We studied CAT administration histories for patients who were administered Patient-Reported Outcomes Measurement Information System (PROMIS) CATs. In the PROMIS 1 Wave 2 Back Pain/Depression Study, CATs were administered to N=417 cases assessed across 11 PROMIS domains. The original CAT administration rules were: start with a pre-identified item of moderate difficulty; administer a minimum of four items per case; stop when the estimated theta{\textquoteright}s SE declines to \< 0.3 OR a maximum of 12 items are administered.

Original CAT. 12,622 CAT administrations were analyzed. CATs ranged from 4 to 12 items administered; 72.5\% were 4-item CATs. The second and third most frequently occurring CATs were 5-item (n=1102; 8.7\%) and 12-item CATs (n=964; 7.6\%). In total, 64,062 items were administered, averaging 5.1 items per CAT. Customized CAT. Three new CAT stopping rules were introduced, each with the potential to increase item-presentation efficiency while maintaining the required score precision: stop if a case responds to the first two items administered using an {\textquotedblleft}extreme{\textquotedblright} response category (towards the ceiling or floor of the item bank); administer a minimum of two items per case; stop if the change in the SE estimate (previous to current item administration) is positive but \< 0.01.

The three new stopping rules reduced the total number of items administered by 25,643, from 64,062 to 38,419 items (a 40.0\% reduction). After four items were administered, only n=1,824 CATs (14.5\%) were still in assessment mode (vs. n=3,477 (27.5\%) in the original CATs). On average, cases completed 3.0 items per CAT (vs. 5.1).

Each new rule addressed a specific inefficiency in the original CAT administration process: cases not having, or possessing only a low or clinically unimportant level of, the assessed domain; allowing the SE \< 0.3 stopping criterion to come into effect earlier in the CAT administration process; and cases experiencing poor measurement from the domain item bank (e.g., {\textquotedblleft}floor{\textquotedblright} and {\textquotedblleft}ceiling{\textquotedblright} cases).

}, keywords = {Administration Rules, Efficiency, FastCAT}, url = {https://drive.google.com/open?id=1oPJV-x0p9hRmgJ7t6k-MCC1nAoBSFM1w}, author = {Richard C. Gershon} } @article {2491, title = {Effect of Imprecise Parameter Estimation on Ability Estimation in a Multistage Test in an Automatic Item Generation Context }, journal = {Journal of Computerized Adaptive Testing}, volume = {4}, year = {2016}, pages = {1-18}, keywords = {Adaptive Testing, automatic item generation, errors in item parameters, item clones, multistage testing}, issn = {2165-6592 }, doi = {10.7333/1608-040101}, url = {http://iacat.org/jcat/index.php/jcat/article/view/59/27}, author = {Colvin, Kimberly and Keller, Lisa A and Robin, Frederic} } @article {2292, title = {The Philosophical Aspects of IRT Equating: Modeling Drift to Evaluate Cohort Growth in Large-Scale Assessments}, journal = {Educational Measurement: Issues and Practice}, volume = {32}, number = {1}, year = {2013}, pages = {2{\textendash}14}, keywords = {cohort growth, construct-relevant drift, evaluation of scale drift, philosophical aspects of IRT equating}, issn = {1745-3992}, doi = {10.1111/emip.12000}, url = {http://dx.doi.org/10.1111/emip.12000}, author = {Taherbhai, Husein and Seo, Daeryong} } @article {2070, title = {catR: An R Package for Computerized Adaptive Testing}, journal = {Applied Psychological Measurement}, year = {2011}, abstract = {

Computerized adaptive testing (CAT) is an active current research field in psychometrics and educational measurement. However, there is very little software available to handle such adaptive tasks. The R package catR was developed to perform adaptive testing with as much flexibility as possible, in an attempt to provide a developmental and testing platform to the interested user. Several item-selection rules and ability estimators are implemented. The item bank can be provided by the user or randomly generated from parent distributions of item parameters. Three stopping rules are available. The output can be graphically displayed.

}, keywords = {computer program, computerized adaptive testing, Estimation, Item Response Theory}, doi = {10.1177/0146621611407482}, author = {Magis, D. and Ra{\^\i}che, G.} } @article {2, title = {Deterioro de par{\'a}metros de los {\'\i}tems en tests adaptativos informatizados: estudio con eCAT [Item parameter drift in computerized adaptive testing: Study with eCAT]}, journal = {Psicothema}, volume = {22}, number = {2}, year = {2010}, note = {Abad, Francisco JOlea, JulioAguado, DavidPonsoda, VicenteBarrada, Juan REnglish AbstractSpainPsicothemaPsicothema. 2010 May;22(2):340-7.}, pages = {340-7}, edition = {2010/04/29}, abstract = {

En el presente trabajo se muestra el an{\'a}lisis realizado sobre un Test Adaptativo Informatizado (TAI) dise{\~n}ado para la evaluaci{\'o}n del nivel de ingl{\'e}s, denominado eCAT, con el objetivo de estudiar el deterioro de par{\'a}metros (parameter drift) producido desde la calibraci{\'o}n inicial del banco de {\'\i}tems. Se ha comparado la calibraci{\'o}n original desarrollada para la puesta en servicio del TAI (N = 3224) y la calibraci{\'o}n actual obtenida con las aplicaciones reales del TAI (N = 7254). Se ha analizado el Funcionamiento Diferencial de los {\'I}tems (FDI) en funci{\'o}n de los par{\'a}metros utilizados y se ha simulado el impacto que sobre el nivel de rasgo estimado tiene la variaci{\'o}n en los par{\'a}metros. Los resultados muestran que se produce especialmente un deterioro de los par{\'a}metros a y c, que hay un importante n{\'u}mero de {\'\i}tems del banco para los que existe FDI y que la variaci{\'o}n de los par{\'a}metros produce un impacto moderado en la estimaci{\'o}n de $\theta$ de los evaluados con nivel de ingl{\'e}s alto. Se concluye que los par{\'a}metros de los {\'\i}tems se han deteriorado y deben ser actualizados. Item parameter drift in computerized adaptive testing: Study with eCAT. This study describes the parameter drift analysis conducted on eCAT (a Computerized Adaptive Test to assess the written English level of Spanish speakers). The original calibration of the item bank (N = 3224) was compared to a new calibration obtained from the data provided by most eCAT operative administrations (N = 7254). A Differential Item Functioning (DIF) study was conducted between the original and the new calibrations. The impact that the new parameters have on the trait level estimates was obtained by simulation. Results show that parameter drift is found especially for the a and c parameters, an important number of bank items show DIF, and the parameter change has a moderate impact on high-level-English $\theta$ estimates. It is therefore recommended to replace the original estimates with the new set.

}, keywords = {*Software, Educational Measurement/*methods/*statistics \& numerical data, Humans, Language}, isbn = {0214-9915 (Print)0214-9915 (Linking)}, author = {Abad, F. J. and Olea, J. and Aguado, D. and Ponsoda, V. and Barrada, J} } @article {77, title = {The maximum priority index method for severely constrained item selection in computerized adaptive testing}, journal = {British Journal of Mathematical and Statistical Psychology}, volume = {62}, number = {2}, year = {2009}, note = {Cheng, YingChang, Hua-HuaResearch Support, Non-U.S. Gov{\textquoteright}tEnglandThe British journal of mathematical and statistical psychologyBr J Math Stat Psychol. 2009 May;62(Pt 2):369-83. Epub 2008 Jun 2.}, month = {May}, pages = {369-83}, edition = {2008/06/07}, abstract = {This paper introduces a new heuristic approach, the maximum priority index (MPI) method, for severely constrained item selection in computerized adaptive testing. Our simulation study shows that it is able to accommodate various non-statistical constraints simultaneously, such as content balancing, exposure control, answer key balancing, and so on. Compared with the weighted deviation modelling method, it leads to fewer constraint violations and better exposure control while maintaining the same level of measurement precision.}, keywords = {Aptitude Tests/*statistics \& numerical data, Diagnosis, Computer-Assisted/*statistics \& numerical data, Educational Measurement/*statistics \& numerical data, Humans, Mathematical Computing, Models, Statistical, Personality Tests/*statistics \& numerical data, Psychometrics/*statistics \& numerical data, Reproducibility of Results, Software}, isbn = {0007-1102 (Print)0007-1102 (Linking)}, author = {Cheng, Y and Chang, Hua-Hua} } @article {112, title = {A mixed integer programming model for multiple stage adaptive testing}, journal = {European Journal of Operational Research}, volume = {193}, number = {2}, year = {2009}, note = {doi: DOI: 10.1016/j.ejor.2007.10.047}, pages = {342-350}, abstract = {The last decade has seen paper-and-pencil (P\&P) tests being replaced by computerized adaptive tests (CATs) within many testing programs. A CAT may yield several advantages relative to a conventional P\&P test. A CAT can determine the questions or test items to administer, allowing each test form to be tailored to a test taker{\textquoteright}s skill level. Subsequent items can be chosen to match the capability of the test taker. By adapting to a test taker{\textquoteright}s ability, a CAT can acquire more information about a test taker while administering fewer items. A Multiple Stage Adaptive test (MST) provides a means to implement a CAT that allows review before the administration. The MST format is a hybrid between the conventional P\&P and CAT formats. This paper presents mixed integer programming models for MST assembly problems. Computational results with commercial optimization software will be given and advantages of the models evaluated.}, keywords = {Education, Integer programming, Linear programming}, isbn = {0377-2217}, author = {Edmonds, J. and Armstrong, R. D.} } @article {241, title = {Binary items and beyond: a simulation of computer adaptive testing using the Rasch partial credit model}, journal = {Journal of Applied Measurement}, volume = {9}, number = {1}, year = {2008}, note = {Lange, RenseUnited StatesJournal of applied measurementJ Appl Meas. 
2008;9(1):81-104.}, pages = {81-104}, edition = {2008/01/09}, abstract = {Past research on Computer Adaptive Testing (CAT) has focused almost exclusively on the use of binary items and minimizing the number of items to be administrated. To address this situation, extensive computer simulations were performed using partial credit items with two, three, four, and five response categories. Other variables manipulated include the number of available items, the number of respondents used to calibrate the items, and various manipulations of respondents{\textquoteright} true locations. Three item selection strategies were used, and the theoretically optimal Maximum Information method was compared to random item selection and Bayesian Maximum Falsification approaches. The Rasch partial credit model proved to be quite robust to various imperfections, and systematic distortions did occur mainly in the absence of sufficient numbers of items located near the trait or performance levels of interest. The findings further indicate that having small numbers of items is more problematic in practice than having small numbers of respondents to calibrate these items. Most importantly, increasing the number of response categories consistently improved CAT{\textquoteright}s efficiency as well as the general quality of the results. In fact, increasing the number of response categories proved to have a greater positive impact than did the choice of item selection method, as the Maximum Information approach performed only slightly better than the Maximum Falsification approach. Accordingly, issues related to the efficiency of item selection methods are far less important than is commonly suggested in the literature. However, being based on computer simulations only, the preceding presumes that actual respondents behave according to the Rasch model. CAT research could thus benefit from empirical studies aimed at determining whether, and if so, how, selection strategies impact performance.}, keywords = {*Data Interpretation, Statistical, *User-Computer Interface, Educational Measurement/*statistics \& numerical data, Humans, Illinois, Models, Statistical}, isbn = {1529-7713 (Print)1529-7713 (Linking)}, author = {Lange, R.} } @article {5, title = {Efficiency and sensitivity of multidimensional computerized adaptive testing of pediatric physical functioning}, journal = {Disability \& Rehabilitation}, volume = {30}, number = {6}, year = {2008}, note = {Allen, Diane DNi, PengshengHaley, Stephen MK02 HD45354-01/HD/NICHD NIH HHS/United StatesNIDDR H133P0001/DD/NCBDD CDC HHS/United StatesResearch Support, N.I.H., ExtramuralEnglandDisability and rehabilitationDisabil Rehabil. 2008;30(6):479-84.}, pages = {479-84}, edition = {2008/02/26}, abstract = {PURPOSE: Computerized adaptive tests (CATs) have efficiency advantages over fixed-length tests of physical functioning but may lose sensitivity when administering extremely low numbers of items. Multidimensional CATs may efficiently improve sensitivity by capitalizing on correlations between functional domains. Using a series of empirical simulations, we assessed the efficiency and sensitivity of multidimensional CATs compared to a longer fixed-length test. METHOD: Parent responses to the Pediatric Evaluation of Disability Inventory before and after intervention for 239 children at a pediatric rehabilitation hospital provided the data for this retrospective study. 
Reliability, effect size, and standardized response mean were compared between full-length self-care and mobility subscales and simulated multidimensional CATs with stopping rules at 40, 30, 20, and 10 items. RESULTS: Reliability was lowest in the 10-item CAT condition for the self-care (r = 0.85) and mobility (r = 0.79) subscales; all other conditions had high reliabilities (r > 0.94). All multidimensional CAT conditions had equivalent levels of sensitivity compared to the full set condition for both domains. CONCLUSIONS: Multidimensional CATs efficiently retain the sensitivity of longer fixed-length measures even with 5 items per dimension (10-item CAT condition). Measuring physical functioning with multidimensional CATs could enhance sensitivity following intervention while minimizing response burden.}, keywords = {*Disability Evaluation, Child, Computers, Disabled Children/*classification/rehabilitation, Efficiency, Humans, Outcome Assessment (Health Care), Psychometrics, Reproducibility of Results, Retrospective Studies, Self Care, Sensitivity and Specificity}, isbn = {0963-8288 (Print)0963-8288 (Linking)}, author = {Allen, D. D. and Ni, P. and Haley, S. M.} } @article {293, title = {The NAPLEX: evolution, purpose, scope, and educational implications}, journal = {American Journal of Pharmaceutical Education}, volume = {72}, number = {2}, year = {2008}, note = {Newton, David WBoyle, MariaCatizone, Carmen AHistorical ArticleUnited StatesAmerican journal of pharmaceutical educationAm J Pharm Educ. 2008 Apr 15;72(2):33.}, month = {Apr 15}, pages = {33}, edition = {2008/05/17}, abstract = {Since 2004, passing the North American Pharmacist Licensure Examination (NAPLEX) has been a requirement for earning initial pharmacy licensure in all 50 United States. The creation and evolution from 1952-2005 of the particular pharmacy competency testing areas and quantities of questions are described for the former paper-and-pencil National Association of Boards of Pharmacy Licensure Examination (NABPLEX) and the current candidate-specific computer adaptive NAPLEX pharmacy licensure examinations. A 40\% increase in the weighting of NAPLEX Blueprint Area 2 in May 2005, compared to that in the preceding 1997-2005 Blueprint, has implications for candidates{\textquoteright} NAPLEX performance and associated curricular content and instruction. New pharmacy graduates{\textquoteright} scores on the NAPLEX are neither intended nor validated to serve as a criterion for assessing or judging the quality or effectiveness of pharmacy curricula and instruction. The newest cycle of NAPLEX Blueprint revision, a continual process to ensure representation of nationwide contemporary practice, began in early 2008. It may take up to 2 years, including surveying several thousand national pharmacists, to complete.}, keywords = {*Educational Measurement, Education, Pharmacy/*standards, History, 20th Century, History, 21st Century, Humans, Licensure, Pharmacy/history/*legislation \& jurisprudence, North America, Pharmacists/*legislation \& jurisprudence, Software}, isbn = {1553-6467 (Electronic)0002-9459 (Linking)}, author = {Newton, D. W. and Boyle, M. and Catizone, C. A.} } @article {17, title = {Rotating item banks versus restriction of maximum exposure rates in computerized adaptive testing}, journal = {Spanish Journal of Psychology}, volume = {11}, number = {2}, year = {2008}, note = {Barrada, Juan RamonOlea, JulioAbad, Francisco JoseResearch Support, Non-U.S. 
Gov{\textquoteright}tSpainThe Spanish journal of psychologySpan J Psychol. 2008 Nov;11(2):618-25.}, pages = {618-625}, edition = {2008/11/08}, abstract = {

If examinees were to know, beforehand, part of the content of a computerized adaptive test, their estimated trait levels would then have a marked positive bias. One of the strategies to avoid this consists of dividing a large item bank into several sub-banks and rotating the sub-bank employed (Ariel, Veldkamp \& van der Linden, 2004). This strategy permits substantial improvements in exposure control at little cost to measurement accuracy. However, we do not know whether this option provides better results than using the master bank with a greater restriction on the maximum exposure rates (Sympson \& Hetter, 1985). In order to investigate this issue, we worked with several simulated banks of 2100 items, comparing them, in terms of RMSE and overlap rate, with the same banks divided into two, three, and up to seven sub-banks. By means of extensive manipulation of the maximum exposure rate in each bank, we found that the option of rotating banks slightly outperformed the option of restricting the maximum exposure rate of the master bank by means of the Sympson-Hetter method.

}, keywords = {*Character, *Databases, *Software Design, Aptitude Tests/*statistics \& numerical data, Bias (Epidemiology), Computing Methodologies, Diagnosis, Computer-Assisted/*statistics \& numerical data, Educational Measurement/*statistics \& numerical data, Humans, Mathematical Computing, Psychometrics/statistics \& numerical data}, isbn = {1138-7416}, author = {Barrada, J and Olea, J. and Abad, F. J.} } @article {111, title = {Evaluation of computer adaptive testing systems}, journal = {International Journal of Web-Based Learning and Teaching Technologies}, volume = {2}, number = {1}, year = {2007}, pages = {70-87}, publisher = {IGI Global: US}, abstract = {Many educational organizations are trying to reduce the cost of the exams, the workload and delay of scoring, and the human errors. Also, they try to increase the accuracy and efficiency of the testing. Recently, most examination organizations use computer adaptive testing (CAT) as the method for large scale testing. This article investigates the current state of CAT systems and identifies their strengths and weaknesses. It evaluates 10 CAT systems using an evaluation framework of 15 domains categorized into three dimensions: educational, technical, and economical. The results show that the majority of the CAT systems give priority to security, reliability, and maintainability. However, they do not offer to the examinee any advanced support and functionalities. Also, the feedback to the examinee is limited and the presentation of the items is poor. Recommendations are made in order to enhance the overall quality of a CAT system. For example, alternative multimedia items should be available so that the examinee would choose a preferred media type. Feedback could be improved by providing more information to the examinee or providing information anytime the examinee wished. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive testing systems, examination organizations, systems evaluation}, isbn = {1548-1093 (Print); 1548-1107 (Electronic)}, author = {Economides, A. A. and Roupas, C} } @article {328, title = {Psychometric evaluation and calibration of health-related quality of life item banks: plans for the Patient-Reported Outcomes Measurement Information System (PROMIS)}, journal = {Medical Care}, volume = {45}, number = {5 Suppl 1}, year = {2007}, note = {Reeve, Bryce BHays, Ron DBjorner, Jakob BCook, Karon FCrane, Paul KTeresi, Jeanne AThissen, DavidRevicki, Dennis AWeiss, David JHambleton, Ronald KLiu, HonghuGershon, RichardReise, Steven PLai, Jin-sheiCella, DavidPROMIS Cooperative GroupAG015815/AG/United States NIAResearch Support, N.I.H., ExtramuralUnited StatesMedical careMed Care. 2007 May;45(5 Suppl 1):S22-31.}, month = {May}, pages = {S22-31}, edition = {2007/04/20}, abstract = {BACKGROUND: The construction and evaluation of item banks to measure unidimensional constructs of health-related quality of life (HRQOL) is a fundamental objective of the Patient-Reported Outcomes Measurement Information System (PROMIS) project. OBJECTIVES: Item banks will be used as the foundation for developing short-form instruments and enabling computerized adaptive testing. The PROMIS Steering Committee selected 5 HRQOL domains for initial focus: physical functioning, fatigue, pain, emotional distress, and social role participation. This report provides an overview of the methods used in the PROMIS item analyses and proposed calibration of item banks. 
ANALYSES: Analyses include evaluation of data quality (eg, logic and range checking, spread of response distribution within an item), descriptive statistics (eg, frequencies, means), item response theory model assumptions (unidimensionality, local independence, monotonicity), model fit, differential item functioning, and item calibration for banking. RECOMMENDATIONS: Summarized are key analytic issues; recommendations are provided for future evaluations of item banks in HRQOL assessment.}, keywords = {*Health Status, *Information Systems, *Quality of Life, *Self Disclosure, Adolescent, Adult, Aged, Calibration, Databases as Topic, Evaluation Studies as Topic, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Psychometrics, Questionnaires/standards, United States}, isbn = {0025-7079 (Print)}, author = {Reeve, B. B. and Hays, R. D. and Bjorner, J. B. and Cook, K. F. and Crane, P. K. and Teresi, J. A. and Thissen, D. and Revicki, D. A. and Weiss, D. J. and Hambleton, R. K. and Liu, H. and Gershon, R. C. and Reise, S. P. and Lai, J. S. and Cella, D.} } @article {343, title = {Psychometric properties of an emotional adjustment measure: An application of the graded response model}, journal = {European Journal of Psychological Assessment}, volume = {23}, number = {1}, year = {2007}, pages = {39-46}, publisher = {Hogrefe \& Huber Publishers GmbH: Germany}, abstract = {Item response theory (IRT) provides valuable methods for the analysis of the psychometric properties of a psychological measure. However, IRT has been mainly used for assessing achievements and ability rather than personality factors. This paper presents an application of the IRT to a personality measure. Thus, the psychometric properties of a new emotional adjustment measure that consists of a 28-six graded response items is shown. Classical test theory (CTT) analyses as well as IRT analyses are carried out. Samejima{\textquoteright}s (1969) graded-response model has been used for estimating item parameters. Results show that the bank of items fulfills model assumptions and fits the data reasonably well, demonstrating the suitability of the IRT models for the description and use of data originating from personality measures. In this sense, the model fulfills the expectations that IRT has undoubted advantages: (1) The invariance of the estimated parameters, (2) the treatment given to the standard error of measurement, and (3) the possibilities offered for the construction of computerized adaptive tests (CAT). The bank of items shows good reliability. It also shows convergent validity compared to the Eysenck Personality Inventory (EPQ-A; Eysenck \& Eysenck, 1975) and the Big Five Questionnaire (BFQ; Caprara, Barbaranelli, \& Borgogni, 1993). (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive tests, Emotional Adjustment, Item Response Theory, Personality Measures, personnel recruitment, Psychometrics, Samejima{\textquoteright}s graded response model, test reliability, validity}, isbn = {1015-5759 (Print)}, author = {Rubio, V. J. and Aguado, D. and Hontangas, P. M. and Hern{\'a}ndez, J. 
M.} } @article {247, title = {Comparing methods of assessing differential item functioning in a computerized adaptive testing environment}, journal = {Journal of Educational Measurement}, volume = {43}, number = {3}, year = {2006}, pages = {245-264}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Mantel-Haenszel and SIBTEST, which have known difficulty in detecting non-unidirectional differential item functioning (DIF), have been adapted with some success for computerized adaptive testing (CAT). This study adapts logistic regression (LR) and the item-response-theory-likelihood-ratio test (IRT-LRT), capable of detecting both unidirectional and non-unidirectional DIF, to the CAT environment in which pretest items are assumed to be seeded in CATs but not used for trait estimation. The proposed adaptation methods were evaluated with simulated data under different sample size ratios and impact conditions in terms of Type I error, power, and specificity in identifying the form of DIF. The adapted LR and IRT-LRT procedures are more powerful than the CAT version of SIBTEST for non-unidirectional DIF detection. The good Type I error control provided by IRT-LRT under extremely unequal sample sizes and large impact is encouraging. Implications of these and other findings are discussed. all rights reserved)}, keywords = {computerized adaptive testing, educational testing, item response theory likelihood ratio test, logistic regression, trait estimation, unidirectional \& non-unidirectional differential item functioning}, isbn = {0022-0655 (Print)}, author = {Lei, P-W. and Chen, S-Y. and Yu, L.} } @inbook {109, title = {Computer-based testing}, booktitle = {Handbook of multimethod measurement in psychology}, volume = {xiv}, year = {2006}, note = {Using Smart Source ParsingHandbook of multimethod measurement in psychology. (pp. 87-100). Washington, DC : American Psychological Association, [URL:http://www.apa.org/books]. xiv, 553 pp}, pages = {87-100}, publisher = {American Psychological Association}, organization = {American Psychological Association}, address = {Washington D.C. USA}, abstract = {(From the chapter) There has been a proliferation of research designed to explore and exploit opportunities provided by computer-based assessment. This chapter provides an overview of the diverse efforts by researchers in this area. It begins by describing how paper-and-pencil tests can be adapted for administration by computers. Computerization provides the important advantage that items can be selected so they are of appropriate difficulty for each examinee. Some of the psychometric theory needed for computerized adaptive testing is reviewed. Then research on innovative computerized assessments is summarized. These assessments go beyond multiple-choice items by using formats made possible by computerization. Then some hardware and software issues are described, and finally, directions for future work are outlined. (PsycINFO Database Record (c) 2006 APA )}, keywords = {Adaptive Testing computerized adaptive testing, Computer Assisted Testing, Experimentation, Psychometrics, Theories}, author = {F Drasgow and Chuah, S. C.} } @article {399, title = {Equating scores from adaptive to linear tests}, journal = {Applied Psychological Measurement}, volume = {30}, number = {6}, year = {2006}, pages = {493-508}, publisher = {Sage Publications: US}, abstract = {Two local methods for observed-score equating are applied to the problem of equating an adaptive test to a linear test. 
In an empirical study, the methods were evaluated against a method based on the test characteristic function (TCF) of the linear test and traditional equipercentile equating applied to the ability estimates on the adaptive test for a population of test takers. The two local methods were generally best. Surprisingly, the TCF method performed slightly worse than the equipercentile method. Both methods showed strong bias and uniformly large inaccuracy, but the TCF method suffered from extra error due to the lower asymptote of the test characteristic function. It is argued that the worse performances of the two methods are a consequence of the fact that they use a single equating transformation for an entire population of test takers and therefore have to compromise between the individual score distributions. }, keywords = {computerized adaptive testing, equipercentile equating, local equating, score reporting, test characteristic function}, isbn = {0146-6216 (Print)}, author = {van der Linden, W. J.} } @article {174, title = {Measurement precision and efficiency of multidimensional computer adaptive testing of physical functioning using the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {87}, number = {9}, year = {2006}, note = {Haley, Stephen MNi, PengshengLudlow, Larry HFragala-Pinkham, Maria AK02 hd45354-01/hd/nichdResearch Support, N.I.H., ExtramuralResearch Support, Non-U.S. Gov{\textquoteright}tUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2006 Sep;87(9):1223-9.}, month = {Sep}, pages = {1223-9}, edition = {2006/08/29}, abstract = {OBJECTIVE: To compare the measurement efficiency and precision of a multidimensional computer adaptive testing (M-CAT) application to a unidimensional CAT (U-CAT) comparison using item bank data from 2 of the functional skills scales of the Pediatric Evaluation of Disability Inventory (PEDI). DESIGN: Using existing PEDI mobility and self-care item banks, we compared the stability of item calibrations and model fit between unidimensional and multidimensional Rasch models and compared the efficiency and precision of the U-CAT- and M-CAT-simulated assessments to a random draw of items. SETTING: Pediatric rehabilitation hospital and clinics. PARTICIPANTS: Clinical and normative samples. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Not applicable. RESULTS: The M-CAT had greater levels of precision and efficiency than the separate mobility and self-care U-CAT versions when using a similar number of items for each PEDI subdomain. Equivalent estimation of mobility and self-care scores can be achieved with a 25\% to 40\% item reduction with the M-CAT compared with the U-CAT. CONCLUSIONS: M-CAT applications appear to have both precision and efficiency advantages compared with separate U-CAT assessments when content subdomains have a high correlation. Practitioners may also realize interpretive advantages of reporting test score information for each subdomain when separate clinical inferences are desired.}, keywords = {*Disability Evaluation, *Pediatrics, Adolescent, Child, Child, Preschool, Computers, Disabled Persons/*classification/rehabilitation, Efficiency, Humans, Infant, Outcome Assessment (Health Care), Psychometrics, Self Care}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Ni, P. and Ludlow, L. H. and Fragala-Pinkham, M. 
A.} } @article {181, title = {Optimal and nonoptimal computer-based test designs for making pass-fail decisions}, journal = {Applied Measurement in Education}, volume = {19}, number = {3}, year = {2006}, pages = {221-239}, publisher = {Lawrence Erlbaum: US}, abstract = {Now that many credentialing exams are being routinely administered by computer, new computer-based test designs, along with item response theory models, are being aggressively researched to identify specific designs that can increase the decision consistency and accuracy of pass-fail decisions. The purpose of this study was to investigate the impact of optimal and nonoptimal multistage test (MST) designs, linear parallel-form test designs (LPFT), and computer adaptive test (CAT) designs on the decision consistency and accuracy of pass-fail decisions. Realistic testing situations matching those of one of the large credentialing agencies were simulated to increase the generalizability of the findings. The conclusions were clear: (a) With the LPFTs, matching test information functions (TIFs) to the mean of the proficiency distribution produced slightly better results than matching them to the passing score; (b) all of the test designs worked better than test construction using random selection of items, subject to content constraints only; (c) CAT performed better than the other test designs; and (d) if matching a TIP to the passing score, the MST design produced a bit better results than the LPFT design. If an argument for the MST design is to be made, it can be made on the basis of slight improvements over the LPFT design and better expected item bank utilization, candidate preference, and the potential for improved diagnostic feedback, compared with the feedback that is possible with fixed linear test forms. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {adaptive test, credentialing exams, Decision Making, Educational Measurement, multistage tests, optimal computer-based test designs, test form}, isbn = {0895-7347 (Print); 1532-4818 (Electronic)}, author = {Hambleton, R. K. and Xing, D.} } @article {319, title = {SIMCAT 1.0: A SAS computer program for simulating computer adaptive testing}, journal = {Applied Psychological Measurement}, volume = {30}, number = {1}, year = {2006}, pages = {60-61}, publisher = {Sage Publications: US}, abstract = {Monte Carlo methodologies are frequently applied to study the sampling distribution of the estimated proficiency level in adaptive testing. These methods eliminate real situational constraints. However, these Monte Carlo methodologies are not currently supported by the available software programs, and when these programs are available, their flexibility is limited. SIMCAT 1.0 is aimed at the simulation of adaptive testing sessions under different adaptive expected a posteriori (EAP) proficiency-level estimation methods (Blais \& Ra{\^\i}che, 2005; Ra{\^\i}che \& Blais, 2005) based on the one-parameter Rasch logistic model. These methods are all adaptive in the a priori proficiency-level estimation, the proficiency-level estimation bias correction, the integration interval, or a combination of these factors. The use of these adaptive EAP estimation methods diminishes considerably the shrinking, and therefore biasing, effect of the estimated a priori proficiency level encountered when this a priori is fixed at a constant value independently of the computed previous value of the proficiency level. 
SIMCAT 1.0 also computes empirical and estimated skewness and kurtosis coefficients, such as the standard error, of the estimated proficiency-level sampling distribution. In this way, the program allows one to compare empirical and estimated properties of the estimated proficiency-level sampling distribution under different variations of the EAP estimation method: standard error and bias, like the skewness and kurtosis coefficients. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive testing, computer program, estimated proficiency level, Monte Carlo methodologies, Rasch logistic model}, isbn = {0146-6216 (Print)}, author = {Ra{\^\i}che, G. and Blais, J-G.} } @article {192, title = {A computer-assisted test design and diagnosis system for use by classroom teachers}, journal = {Journal of Computer Assisted Learning}, volume = {21}, number = {6}, year = {2005}, pages = {419-429}, abstract = {Computer-assisted assessment (CAA) has become increasingly important in education in recent years. A variety of computer software systems have been developed to help assess the performance of students at various levels. However, such systems are primarily designed to provide objective assessment of students and analysis of test items, and focus has been mainly placed on higher and further education. Although there are commercial professional systems available for use by primary and secondary educational institutions, such systems are generally expensive and require skilled expertise to operate. In view of the rapid progress made in the use of computer-based assessment for primary and secondary students by education authorities here in the UK and elsewhere, there is a need to develop systems which are economic and easy to use and can provide the necessary information that can help teachers improve students{\textquoteright} performance. This paper presents the development of a software system that provides a range of functions including generating items and building item banks, designing tests, conducting tests on computers and analysing test results. Specifically, the system can generate information on the performance of students and test items that can be easily used to identify curriculum areas where students are under performing. A case study based on data collected from five secondary schools in Hong Kong involved in the Curriculum, Evaluation and Management Centre{\textquoteright}s Middle Years Information System Project, Durham University, UK, has been undertaken to demonstrate the use of the system for diagnostic and performance analysis. (PsycINFO Database Record (c) 2006 APA ) (journal abstract)}, keywords = {Computer Assisted Testing, Computer Software, Diagnosis, Educational Measurement, Teachers}, author = {He, Q. and Tymms, P.} } @article {4, title = {Propiedades psicom{\'e}tricas de un test Adaptativo Informatizado para la medici{\'o}n del ajuste emocional [Psychometric properties of an Emotional Adjustment Computerized Adaptive Test]}, journal = {Psicothema}, volume = {17}, number = {3}, year = {2005}, pages = {484-491}, abstract = {En el presente trabajo se describen las propiedades psicom{\'e}tricas de un Test Adaptativo Informatizado para la medici{\'o}n del ajuste emocional de las personas. 
La revisi{\'o}n de la literatura acerca de la aplicaci{\'o}n de los modelos de la teor{\'\i}a de la respuesta a los {\'\i}tems (TRI) muestra que {\'e}sta se ha utilizado m{\'a}s en el trabajo con variables aptitudinales que para la medici{\'o}n de variables de personalidad, sin embargo diversos estudios han mostrado la eficacia de la TRI para la descripci{\'o}n psicom{\'e}trica de dichasvariables. Aun as{\'\i}, pocos trabajos han explorado las caracter{\'\i}sticas de un Test Adaptativo Informatizado, basado en la TRI, para la medici{\'o}n de una variable de personalidad como es el ajuste emocional. Nuestros resultados muestran la eficiencia del TAI para la evaluaci{\'o}n del ajuste emocional, proporcionando una medici{\'o}n v{\'a}lida y precisa, utilizando menor n{\'u}mero de elementos de medida encomparaci{\'o}n con las escalas de ajuste emocional de instrumentos fuertemente implantados. Psychometric properties of an emotional adjustment computerized adaptive test. In the present work it was described the psychometric properties of an emotional adjustment computerized adaptive test. An examination of Item Response Theory (IRT) research literature indicates that IRT has been mainly used for assessing achievements and ability rather than personality factors. Nevertheless last years have shown several studies wich have successfully used IRT to personality assessment instruments. Even so, a few amount of works has inquired the computerized adaptative test features, based on IRT, for the measurement of a personality traits as it{\textquoteright}s the emotional adjustment. Our results show the CAT efficiency for the emotional adjustment assessment so this provides a valid and accurate measurement; by using a less number of items in comparison with the emotional adjustment scales from the most strongly established questionnaires.}, keywords = {Computer Assisted Testing, Emotional Adjustment, Item Response, Personality Measures, Psychometrics, Test Validity, Theory}, author = {Aguado, D. and Rubio, V. J. and Hontangas, P. M. and Hern{\'a}ndez, J. M.} } @article {338, title = {A computerized adaptive knowledge test as an assessment tool in general practice: a pilot study}, journal = {Medical Teacher}, volume = {26}, number = {2}, year = {2004}, note = {0142-159xJournal Article}, month = {Mar}, pages = {178-83}, abstract = {Advantageous to assessment in many fields, CAT (computerized adaptive testing) use in general practice has been scarce. In adapting CAT to general practice, the basic assumptions of item response theory and the case specificity must be taken into account. In this context, this study first evaluated the feasibility of converting written extended matching tests into CAT. Second, it questioned the content validity of CAT. A stratified sample of students was invited to participate in the pilot study. The items used in this test, together with their parameters, originated from the written test. The detailed test paths of the students were retained and analysed thoroughly. Using the predefined pass-fail standard, one student failed the test. There was a positive correlation between the number of items and the candidate{\textquoteright}s ability level. The majority of students were presented with questions in seven of the 10 existing domains. Although proved to be a feasible test format, CAT cannot substitute for the existing high-stakes large-scale written test. 
It may provide a reliable instrument for identifying candidates who are at risk of failing in the written test.}, keywords = {*Computer Systems, Algorithms, Educational Measurement/*methods, Family Practice/*education, Humans, Pilot Projects}, author = {Roex, A. and Degryse, J.} } @article {332, title = {Estimating ability and item-selection strategy in self-adapted testing: A latent class approach}, journal = {Journal of Educational and Behavioral Statistics}, volume = {29}, number = {4}, year = {2004}, pages = {379-396}, publisher = {American Educational Research Assn: US}, abstract = {This article presents a psychometric model for estimating ability and item-selection strategies in self-adapted testing. In contrast to computer adaptive testing, in self-adapted testing the examinees are allowed to select the difficulty of the items. The item-selection strategy is defined as the distribution of difficulty conditional on the responses given to previous items. The article shows that missing responses in self-adapted testing are missing at random and can be ignored in the estimation of ability. However, the item-selection strategy cannot always be ignored in such an estimation. An EM algorithm is presented to estimate an examinee{\textquoteright}s ability and strategies, and a model fit is evaluated using Akaike{\textquoteright}s information criterion. The article includes an application with real data to illustrate how the model can be used in practice for evaluating hypotheses, estimating ability, and identifying strategies. In the example, four strategies were identified and related to examinees{\textquoteright} ability. It was shown that individual examinees tended not to follow a consistent strategy throughout the test. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {estimating ability, item-selection strategies, psychometric model, self-adapted testing}, isbn = {1076-9986 (Print)}, author = {Revuelta, J.} } @article {245, title = {{\'E}valuation et multim{\'e}dia dans l{\textquoteright}apprentissage d{\textquoteright}une L2 [Assessment and multimedia in learning an L2]}, journal = {ReCALL}, volume = {16}, number = {2}, year = {2004}, pages = {475-487}, abstract = {In the first part of this paper different areas where technology may be used for second language assessment are described. First, item banking operations, which are generally based on item Response Theory but not necessarily restricted to dichotomously scored items, facilitate assessment task organization and require technological support. Second, technology may help to design more authentic assessment tasks or may be needed in some direct testing situations. Third, the assessment environment may be more adapted and more stimulating when technology is used to give the student more control. The second part of the paper presents different functions of assessment. The monitoring function (often called formative assessment) aims at adapting the classroom activities to students and to provide continuous feedback. Technology may be used to train the teachers in monitoring techniques, to organize data or to produce diagnostic information; electronic portfolios or quizzes that are built in some educational software may also be used for monitoring. The placement function is probably the one in which the application of computer adaptive testing procedures (e.g. French CAPT) is the most appropriate. Automatic scoring devices may also be used for placement purposes. 
Finally the certification function requires more valid and more reliable tools. Technology may be used to enhance the testing situation (to make it more authentic) or to facilitate data processing during the construction of a test. Almond et al. (2002) propose a four component model (Selection, Presentation, Scoring and Response) for designing assessment systems. Each component must be planned taking into account the assessment function. }, keywords = {Adaptive Testing, Computer Assisted Instruction, Educational, Foreign Language Learning, Program Evaluation, Technology computerized adaptive testing}, author = {Laurier, M.} } @inbook {42, title = {Assessing question banks}, booktitle = {Reusing online resources: A sustanable approach to e-learning}, number = {1}, year = {2003}, pages = {171-230}, publisher = {Kogan Page Ltd.}, organization = {Kogan Page Ltd.}, address = {London, UK}, abstract = {In Chapter 14, Joanna Bull and James Daziel provide a comprehensive treatment of the issues surrounding the use of Question Banks and Computer Assisted Assessment, and provide a number of excellent examples of implementations. In their review of the technologies employed in Computer Assisted Assessment the authors include Computer Adaptive Testing and data generation. The authors reveal significant issues involving the impact of Intellectual Property rights and computer assisted assessment and make important suggestions for strategies to overcome these obstacles. (PsycINFO Database Record (c) 2005 APA )http://www-jime.open.ac.uk/2003/1/ (journal abstract)}, keywords = {Computer Assisted Testing, Curriculum Based Assessment, Education, Technology computerized adaptive testing}, author = {Bull, J. and Dalziel, J. and Vreeland, T.} } @article {63, title = {A comparative study of item exposure control methods in computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {40}, number = {1}, year = {2003}, pages = {71-103}, abstract = {This study compared the properties of five methods of item exposure control within the purview of estimating examinees{\textquoteright} abilities in a computerized adaptive testing (CAT) context. Each exposure control algorithm was incorporated into the item selection procedure and the adaptive testing progressed based on the CAT design established for this study. The merits and shortcomings of these strategies were considered under different item pool sizes and different desired maximum exposure rates and were evaluated in light of the observed maximum exposure rates, the test overlap rates, and the conditional standard errors of measurement. Each method had its advantages and disadvantages, but no one possessed all of the desired characteristics. There was a clear and logical trade-off between item exposure control and measurement precision. The M. L. Stocking and C. Lewis conditional multinomial procedure and, to a slightly lesser extent, the T. Davey and C. G. Parshall method seemed to be the most promising considering all of the factors that this study addressed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Educational, Item Analysis (Statistical), Measurement, Strategies computerized adaptive testing}, author = {Chang, S-W. and Ansley, T. 
N.} } @article {94, title = {Item exposure constraints for testlets in the verbal reasoning section of the MCAT}, journal = {Applied Psychological Measurement}, volume = {27}, number = {5}, year = {2003}, pages = {335-356}, abstract = {The current study examined item exposure control procedures for testlet-scored reading passages in the Verbal Reasoning section of the Medical College Admission Test with four computerized adaptive testing (CAT) systems using the partial credit model. The first system used a traditional CAT using maximum information item selection. The second used random item selection to provide a baseline for optimal exposure rates. The third used a variation of Lunz and Stahl{\textquoteright}s randomization procedure. The fourth used Luecht and Nungester{\textquoteright}s computerized adaptive sequential testing (CAST) system. A series of simulated fixed-length CATs was run to determine the optimal item length selection procedure. Results indicated that both the randomization procedure and CAST performed well in terms of exposure control and measurement precision, with the CAST system providing the best overall solution when all variables were taken into consideration. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Entrance Examinations, Item Response Theory, Random Sampling, Reasoning, Verbal Ability computerized adaptive testing}, author = {Davis, L. L. and Dodd, B. G.} } @article {14, title = {Data sparseness and on-line pretest item calibration-scaling methods in CAT}, journal = {Journal of Educational Measurement}, volume = {39}, number = {3}, year = {2002}, pages = {207-218}, abstract = {Compared and evaluated 3 on-line pretest item calibration-scaling methods (the marginal maximum likelihood estimate with 1 expectation maximization [EM] cycle [OEM] method, the marginal maximum likelihood estimate with multiple EM cycles [MEM] method, and M. L. Stocking{\textquoteright}s Method B) in terms of item parameter recovery when the item responses to the pretest items in the pool are sparse. Simulations of computerized adaptive tests were used to evaluate the results yielded by the three methods. The MEM method produced the smallest average total error in parameter estimation, and the OEM method yielded the largest total error. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, Educational Measurement, Item Response Theory, Maximum Likelihood, Methodology, Scaling (Testing), Statistical Data}, author = {Ban, J-C. and Hanson, B. A. and Yi, Q. and Harris, D. J.} } @article {418, title = {Mathematical-programming approaches to test item pool design}, number = {RR 02-09}, year = {2002}, note = {Advances in psychology research, Vol. (Hauppauge, NY: Nova Science Publishers, Inc., [URL:http://www.Novapublishers.com]). vi, 228 pp}, pages = {93-108}, institution = {University of Twente, Faculty of Educational Science and Technology}, address = {Twente, The Netherlands}, abstract = {(From the chapter) This paper presents an approach to item pool design that has the potential to improve on the quality of current item pools in educational and psychological testing and hence to increase both measurement precision and validity. The approach consists of the application of mathematical programming techniques to calculate optimal blueprints for item pools. These blueprints can be used to guide the item-writing process. 
Three different types of design problems are discussed, namely item pools for linear tests, item pools for computerized adaptive testing (CAT), and systems of rotating item pools for CAT. The paper concludes with an empirical example of the problem of designing a system of rotating item pools for CAT.}, keywords = {Adaptive Testing, Computer Assisted, Computer Programming, Educational Measurement, Item Response Theory, Mathematics, Psychometrics, Statistical Rotation computerized adaptive testing, Test Items, Testing}, isbn = {02-09}, author = {Veldkamp, B. P. and van der Linden, W. J. and Ariel, A.} } @article {277, title = {Outlier detection in high-stakes certification testing}, journal = {Journal of Educational Measurement}, volume = {39}, number = {3}, year = {2002}, pages = {219-233}, abstract = {Discusses recent developments of person-fit analysis in computerized adaptive testing (CAT). Methods from statistical process control are presented that have been proposed to classify an item score pattern as fitting or misfitting the underlying item response theory model in CAT. Most person-fit research in CAT is restricted to simulated data. In this study, empirical data from a certification test were used. Alternatives are discussed to generate norms so that bounds can be determined to classify an item score pattern as fitting or misfitting. Using bounds determined from a sample of a high-stakes certification test, the empirical analysis showed that different types of misfit can be distinguished. Further applications using statistical process control methods to detect misfitting item score patterns are discussed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, computerized adaptive testing, Educational Measurement, Goodness of Fit, Item Analysis (Statistical), Item Response Theory, person Fit, Statistical Estimation, Statistical Power, Test Scores}, author = {Meijer, R. R.} } @inbook {108, title = {The work ahead: A psychometric infrastructure for computerized adaptive tests}, booktitle = {Computer-based tests: Building the foundation for future assessment}, year = {2002}, note = {Computer-based testing: Building the foundation for future assessments (pp. 1-35). Mahwah, NJ: Lawrence Erlbaum Associates, Publishers. xi, 326 pp}, publisher = {Lawrence Erlbaum Associates, Inc.}, organization = {Lawrence Erlbaum Associates, Inc.}, address = {Mahwah, N.J. USA}, abstract = {(From the chapter) Considers the past and future of computerized adaptive tests and computer-based tests and looks at issues and challenges confronting a testing program as it implements and operates a computer-based test. Recommendations for testing programs from the National Council on Measurement in Education Ad Hoc Committee on Computerized Adaptive Test Disclosure are appended. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Educational, Measurement, Psychometrics}, author = {Drasgow, F.}, editor = {M. P. Potenza and J. J. Freemer and W. C. Ward} } @article {188, title = {Item selection in computerized adaptive testing: Should more discriminating items be used first?}, journal = {Journal of Educational Measurement}, volume = {38}, number = {3}, year = {2001}, pages = {249-266}, abstract = {During computerized adaptive testing (CAT), items are selected continuously according to the test-taker{\textquoteright}s estimated ability. 
Test security has become a problem because high-discrimination items are more likely to be selected and become overexposed. So, there seems to be a tradeoff between high efficiency in ability estimations and balanced usage of items. This series of four studies addressed the dilemma by focusing on the notion of whether more or less discriminating items should be used first in CAT. The first study demonstrated that the common maximum information method with J. B. Sympson and R. D. Hetter (1985) control resulted in the use of more discriminating items first. The remaining studies showed that using items in the reverse order, as described in H. Chang and Z. Ying{\textquoteright}s (1999) stratified method, had potential advantages: (a) a more balanced item usage and (b) a relatively stable resultant item pool structure with easy and inexpensive management. This stratified method may have ability-estimation efficiency better than or close to that of other methods. It is argued that the judicious selection of items, as in the stratified method, is a more active control of item exposure. (PsycINFO Database Record (c) 2005 APA )}, keywords = {ability, Adaptive Testing, Computer Assisted Testing, Estimation, Statistical, Test Items computerized adaptive testing}, author = {Hau, Kit-Tai and Chang, Hua-Hua} } @article {297, title = {Requerimientos, aplicaciones e investigaci{\'o}n en tests adaptativos informatizados [Requirements, applications, and research in computerized adaptive testing]}, journal = {Apuntes de Psicologia}, volume = {19}, number = {1}, year = {2001}, pages = {11-28}, abstract = {Summarizes the main requirements and applications of computerized adaptive testing (CAT) with emphasis on the differences between CAT and conventional computerized tests. Psychometric properties of estimations based on CAT, item selection strategies, and implementation software are described. Results of CAT studies in Spanish-speaking samples are described. Implications for developing a CAT measuring the English vocabulary of Spanish-speaking students are discussed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, English as Second Language, Psychometrics computerized adaptive testing}, author = {Olea D{\'\i}az, J. and Ponsoda Gil, V. and Revuelta Men{\'e}ndez, J. and Hontangas Beltr{\'a}n, P. and Abad, F. J.} } @article {368, title = {CAT administration of language placement examinations}, journal = {Journal of Applied Measurement}, volume = {1}, number = {3}, year = {2000}, note = {1529-7713; Journal Article}, pages = {292-302}, abstract = {This article describes the development of a computerized adaptive test for Cegep de Jonquiere, a community college located in Quebec, Canada. Computerized language proficiency testing allows the simultaneous presentation of sound stimuli as the question is being presented to the test-taker. With a properly calibrated bank of items, the language proficiency test can be offered in an adaptive framework. By adapting the test to the test-taker{\textquoteright}s level of ability, an assessment can be made with significantly fewer items. We also describe our initial attempt to detect instances in which "cheating low" is occurring. 
In the "cheating low" situation, test-takers deliberately give incorrect answers to questions that they would be fully capable of answering correctly if they were taking the test honestly.}, keywords = {*Language, *Software, Aptitude Tests/*statistics \& numerical data, Educational Measurement/*statistics \& numerical data, Humans, Psychometrics, Reproducibility of Results, Research Support, Non-U.S. Gov{\textquoteright}t}, author = {Stahl, J. and Bergstrom, B. and Gershon, R. C.} } @article {232, title = {Lagrangian relaxation for constrained curve-fitting with binary variables: Applications in educational testing}, journal = {Dissertation Abstracts International Section A: Humanities and Social Sciences}, volume = {61}, number = {3-A}, year = {2000}, pages = {1063}, abstract = {This dissertation offers a mathematical programming approach to curve fitting with binary variables. Various Lagrangian Relaxation (LR) techniques are applied to constrained curve fitting. Applications in educational testing with respect to test assembly are utilized. In particular, techniques are applied to both static exams (i.e. conventional paper-and-pencil (P\&P)) and adaptive exams (i.e. a hybrid computerized adaptive test (CAT) called a multiple-forms structure (MFS)). This dissertation focuses on the development of mathematical models to represent these test assembly problems as constrained curve-fitting problems with binary variables and solution techniques for the test development. Mathematical programming techniques are used to generate parallel test forms with item characteristics based on item response theory. A binary variable is used to represent whether or not an item is present on a form. The problem of creating a test form is modeled as a network flow problem with additional constraints. In order to meet the target information and the test characteristic curves, a Lagrangian relaxation heuristic is applied to the problem. The Lagrangian approach works by multiplying the constraint by a "Lagrange multiplier" and adding it to the objective. By systematically varying the multiplier, the test form curves approach the targets. This dissertation explores modifications to Lagrangian Relaxation as it is applied to the classical paper-and-pencil exams. For the P\&P exams, LR techniques are also utilized to include additional practical constraints to the network problem, which limit the item selection. An MFS is a type of computerized adaptive test. It is a hybrid of a standard CAT and a P\&P exam. The concept of an MFS will be introduced in this dissertation, as well as the application of LR to constructing parallel MFSs. The approach is applied to the Law School Admission Test for the assembly of the conventional P\&P test as well as an experimental computerized test using MFSs. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Analysis, Educational Measurement, Mathematical Modeling, Statistical}, author = {Koppel, N. B.} } @article {28, title = {Competency gradient for child-parent centers}, journal = {Journal of Outcomes Measurement}, volume = {3}, number = {1}, year = {1999}, note = {1090-655X (Print); Journal Article; Research Support, U.S. Gov{\textquoteright}t, P.H.S.}, pages = {35-52}, abstract = {This report describes an implementation of the Rasch model during the longitudinal evaluation of a federally-funded early childhood preschool intervention program. 
An item bank is described for operationally defining a psychosocial construct called community life-skills competency, an expected teenage outcome of the preschool intervention. This analysis examined the position of teenage students on this scale structure, and investigated a pattern of cognitive operations necessary for students to pass community life-skills test items. Then this scale structure was correlated with nationally standardized reading and math achievement scores, teacher ratings, and school records to assess its validity as a measure of the community-related outcome goal for this intervention. The results show a functional relationship between years of early intervention and magnitude of effect on the life-skills competency variable.}, keywords = {*Models, Statistical, Activities of Daily Living/classification/psychology, Adolescent, Chicago, Child, Child, Preschool, Early Intervention (Education)/*statistics \& numerical data, Female, Follow-Up Studies, Humans, Male, Outcome and Process Assessment (Health Care)/*statistics \& numerical data}, author = {Bezruczko, N.} } @article {234, title = {Evaluating the usefulness of computerized adaptive testing for medical in-course assessment}, journal = {Academic Medicine}, volume = {74}, number = {10}, year = {1999}, note = {Acad Med. 1999 Oct;74(10):1125-8.}, month = {Oct}, pages = {1125-8}, edition = {1999/10/28}, abstract = {PURPOSE: This study investigated the feasibility of converting an existing computer-administered, in-course internal medicine test to an adaptive format. METHOD: A 200-item internal medicine extended matching test was used for this research. Parameters were estimated with commercially available software with responses from 621 examinees. A specially developed simulation program was used to retrospectively estimate the efficiency of the computer-adaptive exam format. RESULTS: It was found that the average test length could be shortened by almost half with measurement precision approximately equal to that of the full 200-item paper-and-pencil test. However, computer-adaptive testing with this item bank provided little advantage for examinees at the upper end of the ability continuum. An examination of classical item statistics and IRT item statistics suggested that adding more difficult items might extend the advantage to this group of examinees. CONCLUSIONS: Medical item banks presently used for in-course assessment might be advantageously employed in adaptive testing. However, it is important to evaluate the match between the items and the measurement objective of the test before implementing this format.}, keywords = {*Automation, *Education, Medical, Undergraduate, Educational Measurement/*methods, Humans, Internal Medicine/*education, Likelihood Functions, Psychometrics/*methods, Reproducibility of Results}, isbn = {1040-2446 (Print)}, author = {Kreiter, C. D. and Ferguson, K. and Gruppen, L. D.} } @article {177, title = {The effect of item pool restriction on the precision of ability measurement for a Rasch-based CAT: comparisons to traditional fixed length examinations}, journal = {Journal of Outcomes Measurement}, volume = {2}, number = {2}, year = {1998}, note = {98326380; 1090-655X; Journal Article}, pages = {97-122}, abstract = {This paper describes a method for examining the precision of a computerized adaptive test with a limited item pool. 
Standard errors of measurement ascertained in the testing of simulees with a CAT using a restricted pool were compared to the results obtained in live paper-and-pencil achievement testing of 4494 nursing students on four versions of an examination of calculations of drug administration. CAT measures of precision were considered when the simulated examinee pools were uniform and normal. Precision indices were also considered in terms of the number of CAT items required to reach the precision of the traditional tests. Results suggest that regardless of the size of the item pool, CAT provides greater precision in measurement with a smaller number of items administered even when the choice of items is limited, but fails to achieve equiprecision along the entire ability continuum.}, keywords = {*Decision Making, Computer-Assisted, Comparative Study, Computer Simulation, Education, Nursing, Educational Measurement/*methods, Human, Models, Statistical, Psychometrics/*methods}, author = {Halkitis, P. N.} } @article {137, title = {Methodologic trends in the healthcare professions: computer adaptive and computer simulation testing}, journal = {Nurse Educator}, volume = {21}, number = {4}, year = {1996}, note = {Nurse Educ. 1996 Jul-Aug;21(4):13-4.}, month = {Jul-Aug}, pages = {13-4}, edition = {1996/07/01}, abstract = {Assessing knowledge and performance on computer is rapidly becoming a common phenomenon in testing and measurement. Computer adaptive testing presents an individualized test format in accordance with the examinee{\textquoteright}s ability level. The efficiency of the testing process enables a more precise estimate of performance, often with fewer items than traditional paper-and-pencil testing methodologies. Computer simulation testing involves performance-based, or authentic, assessment of the examinee{\textquoteright}s clinical decision-making abilities. The authors discuss the trends in assessing performance through computerized means and the application of these methodologies to community-based nursing practice.}, keywords = {*Clinical Competence, *Computer Simulation, Computer-Assisted Instruction/*methods, Educational Measurement/*methods, Humans}, isbn = {0363-3624 (Print); 0363-3624 (Linking)}, author = {Forker, J. E. and McDonald, M. E.} } @article {217, title = {Moving in a new direction: Computerized adaptive testing (CAT)}, journal = {Nursing Management}, volume = {24}, number = {1}, year = {1993}, note = {Nurs Manage. 1993 Jan;24(1):80, 82.}, month = {Jan}, pages = {80, 82}, edition = {1993/01/01}, keywords = {*Computers, Accreditation/methods, Educational Measurement/*methods, Licensure, Nursing, United States}, isbn = {0744-6314 (Print)}, author = {Jones-Dickson, C. and Dorsey, D. and Campbell-Warnock, J. and Fields, F.} } @article {126, title = {Computerized adaptive testing for NCLEX-PN}, journal = {Journal of Practical Nursing}, volume = {42}, number = {2}, year = {1992}, note = {J Pract Nurs. 1992 Jun;42(2):8-10.}, month = {Jun}, pages = {8-10}, edition = {1992/06/01}, keywords = {*Licensure, *Programmed Instruction, Educational Measurement/*methods, Humans, Nursing, Practical/*education}, isbn = {0022-3867 (Print)}, author = {Fields, F. 
A.} } @article {37, title = {Future directions for the National Council: the Computerized Adaptive Testing Project}, journal = {Issues}, volume = {11}, number = {4}, year = {1990}, note = {91161308; 0885-0046; Journal Article}, pages = {1, 3, 5}, keywords = {*Computers, *Licensure, Educational Measurement/*methods, Societies, Nursing, United States}, author = {Bouchard, J.} } @article {190, title = {National Council Computerized Adaptive Testing Project Review--committee perspective}, journal = {Issues}, volume = {11}, number = {4}, year = {1990}, note = {91161311; 0885-0046; Journal Article}, pages = {3}, keywords = {*Computers, *Licensure, Educational Measurement/*methods, Feasibility Studies, Societies, Nursing, United States}, author = {Haynes, B.} }