ChimeraTK-ApplicationCore 04.06.00
Loading...
Searching...
No Matches
testRecoveryGroups.cc
Go to the documentation of this file.
1// SPDX-FileCopyrightText: Deutsches Elektronen-Synchrotron DESY, MSK, ChimeraTK Project <chimeratk-support@desy.de>
2// SPDX-License-Identifier: LGPL-3.0-or-later
3#include <ChimeraTK/Device.h>
4
5#include <boost/smart_ptr/make_shared_object.hpp>
6#include <boost/thread/exceptions.hpp>
7
8#define BOOST_TEST_MODULE testRecoveryGroups
9
10#include "Application.h"
11#include "check_timeout.h"
12#include "DeviceManager.h"
13#include "DeviceModule.h"
14#include "ModuleGroup.h"
15#include "TestFacility.h"
16
17#include <ChimeraTK/BackendFactory.h>
18#include <ChimeraTK/cppext/finally.hpp>
19#include <ChimeraTK/Exception.h>
20#include <ChimeraTK/ExceptionDummyBackend.h>
21#include <ChimeraTK/LogicalNameMappingBackend.h>
22#include <ChimeraTK/NDRegisterAccessor.h>
23#include <ChimeraTK/ScalarRegisterAccessor.h>
24#include <ChimeraTK/VoidRegisterAccessor.h>
25
26namespace ctk = ChimeraTK;
27
28#include <boost/smart_ptr/shared_ptr.hpp>
29#include <boost/test/included/unit_test.hpp>
30
31#include <barrier>
32#include <cstdint>
33#include <string>
34
35// Helper class to have all variable names from a device prepended by the cdd/alias name
36// e.g. /Integers/unsigned32 from Use1 ends up in /Use1/Integers/unsigned32
38 DeviceModuleWithPath(ModuleGroup* owner, std::string const& cdd)
39 : ModuleGroup(owner, cdd, ""), dev(this, cdd, "/somepath/dummyTrigger") {}
41};
42
43// Test backend which allows to block write operations.
44struct WriteBlockingDummy : public ChimeraTK::ExceptionDummy {
45 using ExceptionDummy::ExceptionDummy;
46
47 std::atomic<bool> blockWriteOnce{false}; // Only use the following barrier if true.
48 std::barrier<> blockWriteArrivedBarrier{2}; // Tell the test thread that we are there
49 std::barrier<> blockWriteContinueBarrier{2}; // Wait for the test to tell us to continue.
50 std::atomic<bool> throwThreadInterrupted{false}; // Throw a boost::thread_interrupted exception.
51
52 void write(uint64_t bar, uint64_t address, int32_t const* data, size_t sizeInBytes) override {
53 if(blockWriteOnce.exchange(false)) { // atomically set to false and check the old value
54 (void)blockWriteArrivedBarrier.arrive(); // Notify the test.
55 blockWriteContinueBarrier.arrive_and_wait(); // Wait for the test to tell us to continue.
57 throw boost::thread_interrupted(); // NOLINT hicpp-exception-baseclass
58 }
59 }
60
61 ExceptionDummy::write(bar, address, data, sizeInBytes);
62 }
63
// Factory function passed to the BackendFactory for the backend type name "WriteBlockingDummy".
// The first (unnamed) string argument is ignored. The backend is constructed from the value stored under the
// "map" key of the parameter map. Note that operator[] inserts an empty string if the key is absent; as the map
// is taken by value this only modifies the local copy.
64 // NOLINTNEXTLINE performance-unnecessary-value-param (signature required like this by BackendFactory)
65 static boost::shared_ptr<DeviceBackend> creatorFunction(std::string, std::map<std::string, std::string> parameters) {
66 return boost::make_shared<WriteBlockingDummy>(parameters["map"]);
67 }
68
69 struct Registerer {
71 ctk::BackendFactory::getInstance().registerBackendType("WriteBlockingDummy", WriteBlockingDummy::creatorFunction);
72 }
73 };
74};
75
76static WriteBlockingDummy::Registerer writeBlockingBackendRegisterer;
77
78// Test backend which counts the number of open() calls and allows to block write operations.
79struct OpenCountingLmapBackend : public ChimeraTK::LogicalNameMappingBackend {
80 using LogicalNameMappingBackend::LogicalNameMappingBackend;
81
82 std::atomic<size_t> openCounter{0};
83
84 std::barrier<> aboutToThrowArrivedBarrier{2}; // Tell the test thread that we are there
85 std::barrier<> aboutToThrowContinueBarrier{2}; // Wait for the test to tell us to continue.
86 std::atomic<bool> throwThreadInterrupted{false}; // Throw a boost::thread_interrupted exception.
87 static std::atomic<size_t> globalOpenCounter; // Count the total number of open calls in throwing context.
88
89 std::atomic<bool> blockOpen{false};
90 std::barrier<> blockOpenArrivedBarrier{2};
91 std::barrier<> blockOpenContinueBarrier{2};
92
93 void open() override {
95 // Only block for the testing sequence when the other two devices have successfully opened (and are waiting at the
96 // barrier). This does not run in parallel due to the recovery groups "open mutex". If we block here, none of the
97 // other backends will get through because we are holding the lock.
98 if(globalOpenCounter == 2) {
99 (void)aboutToThrowArrivedBarrier.arrive();
100 aboutToThrowContinueBarrier.arrive_and_wait();
101 throw boost::thread_interrupted(); // NOLINT hicpp-exception-baseclass
102 }
103 // just retry later
104 throw ctk::runtime_error("Not ready to block yet");
105 }
106 // Handshake the barrier exactly once when requested. The first backend to arrive here will atomically check it and
107 // turn it off
108 if(blockOpen.exchange(false)) {
109 (void)blockOpenArrivedBarrier.arrive();
110 blockOpenContinueBarrier.arrive_and_wait();
111 }
112
113 ++openCounter;
115 ChimeraTK::LogicalNameMappingBackend::open();
116 }
117
// Factory function passed to the BackendFactory for the backend type name "OpenCountingLmapBackend".
// The first (unnamed) string argument is ignored. The backend is constructed from the "map" parameter; all
// remaining parameters are stored in the backend's _parameters member. The result is returned as a pointer to the
// DeviceBackend base class.
118 // NOLINTNEXTLINE performance-unnecessary-value-param (signature required like this by BackendFactory)
119 static boost::shared_ptr<DeviceBackend> creatorFunction(std::string, std::map<std::string, std::string> parameters) {
// operator[] guarantees that the "map" key exists afterwards (it is default-inserted if it was missing) ...
120 auto ptr = boost::make_shared<OpenCountingLmapBackend>(parameters["map"]);
// ... hence find() below never returns end() and the erase is always valid.
121 parameters.erase(parameters.find("map"));
// Hand the remaining entries (everything except "map") of the local copy to the backend.
122 ptr->_parameters = parameters;
123 return boost::static_pointer_cast<DeviceBackend>(ptr);
124 }
125
126 struct Registerer {
128 ctk::BackendFactory::getInstance().registerBackendType(
129 "OpenCountingLmapBackend", OpenCountingLmapBackend::creatorFunction);
130 }
131 };
132};
133
135static OpenCountingLmapBackend::Registerer testLmapBackendRegisterer;
136
137// A test application with 4 devices in 2 recovery groups.
138// It is used in most tests, and extended with initialisation handlers where needed.
140 explicit BasicTestApp(const std::string& name = "BasicTestApp") : Application(name) {}
141 ~BasicTestApp() override { shutdown(); }
142
143 ctk::SetDMapFilePath path{"recoveryGroups.dmap"};
144
145 // Recovery group: Two devices with one backend each, and a device which uses both of them
149
150 // Use3 is in its own recovery "group"
152};
153
154/**********************************************************************************************************************/
155
156template<class APP>
157struct Fixture {
159 ChimeraTK::TestFacility testFacility{testApp, /* enableTestableMode */ false};
160 ctk::VoidRegisterAccessor trigger{testFacility.getVoid("/somepath/dummyTrigger")};
161
164
166};
167
168/**********************************************************************************************************************/
169
181 // Pre-condition: wait until all devices are ok
182 // Necessary because we are not using the testable mode
183 for(auto const* dev : {"Use1", "Use2", "Use3", "Use12"}) {
184 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
185 }
186
187 // Test preparation: turn backend 1 into exception state
188 auto dummy1 = boost::dynamic_pointer_cast<ctk::ExceptionDummy>(raw1.getBackend());
189 dummy1->throwExceptionOpen = true;
190 dummy1->throwExceptionRead = true;
191
192 trigger.write();
193
194 // The actual test A.5: Check that Use1, Use2 and Use12 are in the same recovery group and thus have seen the error.
195 // Requirement for A.5.1
196 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
197 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 1, 10000);
198 }
199
200 // Test A.5.1: Use3 is in a different recovery group and still OK
201 CHECK_TIMEOUT(testFacility.readScalar<int>("Devices/Use3/status") == 0, 10000);
202
203 // Remove error condition on raw1 and recover everything
204 dummy1->throwExceptionOpen = false;
205 dummy1->throwExceptionRead = false;
206
207 for(auto const* dev : {"Use1", "Use2", "Use12", "Use3"}) {
208 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
209 }
210}
211
212/**********************************************************************************************************************/
213
220 // Just check that Use1 and Use2 do not share any backend IDs. That they are in the same recovery group is already
221 // tested in testExceptionHandling_a_5_1
222 auto& dm1 = testApp.singleDev1.dev.getDeviceManager();
223 auto& dm2 = testApp.singleDev2.dev.getDeviceManager();
224
225 auto ids1 = dm1.getDevice().getInvolvedBackendIDs();
226 auto ids2 = dm2.getDevice().getInvolvedBackendIDs();
227 for(auto id : ids1) {
228 BOOST_CHECK(!ids2.contains(id));
229 }
230}
231
232/**********************************************************************************************************************/
233
240 // Unfortunately we can only do a race condition test, and check that the race condition does not occur. So the test
241 // is insensitive for most of the time, but we still have a chance to see the error condition from time to time if it
242 // is there.
243 // Through the public API we cannot inject waiting code which blocks before the barrier and then test that nothing has
244 // run behind the barrier yet (which would be sensitive most of the time).
245 // So we take the first point after the barrier where we can block the code execution, and check that everything
246 // before the barrier has happened.
247
248 // Pre-condition: wait until all devices are ok
249 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
250 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
251 }
252 auto testLmap1 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
253 testApp.singleDev1.dev.getDeviceManager().getDevice().getBackend());
254 testLmap1->blockOpen = true;
255
256 testApp.singleDev1.dev.reportException("reported from TestDetectBarrier");
257
258 testLmap1->blockOpenArrivedBarrier.arrive_and_wait();
259 // We now know that the recovery is blocked, so the following test does not produce
260 // false positives because the recovery has already run through and cleared the error condition before we see it.
261
262 // The actual test:
263 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
264 // Test without timeout! The devices must NOW be in error state.
265 BOOST_TEST(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 1);
266 }
267
268 // Finish the recovery.
269 (void)testLmap1->blockOpenContinueBarrier.arrive();
270 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
271 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
272 }
273}
274
275/**********************************************************************************************************************/
276
283 // pre-condition: all devices in recovery group are OK
284 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
285 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
286 }
287 // Set a different value for the register written by the init handler, so we can see if the handler ran.
288 raw2.write<int32_t>("/MyModule/actuator", 16);
289
290 // Test preparation: Put backend 1 into an error state with read error.
291 auto dummy1 = boost::dynamic_pointer_cast<ctk::ExceptionDummy>(raw1.getBackend());
292 dummy1->throwExceptionOpen = true;
293 dummy1->throwExceptionRead = true;
294
295 trigger.write();
296 // wait until the errors have been seen.
297 for(auto const* dev : {"Use1", "Use2"}) {
298 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 1, 10000);
299 }
300
301 // Wait for the device 2 backend to become ok, so we know that the according DeviceManager has run the OPEN stage.
302 CHECK_TIMEOUT(raw2.isFunctional(), 10000);
303 usleep(100000); // Wait 100 ms for the init handler. It should not happen, so don't wait too long...
304
305 // The actual test: The init script of Use2 has not run.
306 BOOST_TEST(raw2.read<int32_t>("MyModule/actuator") == 16);
307
308 // Cleanup: Resolve the error and see that everything recovers.
309 dummy1->throwExceptionOpen = false;
310 dummy1->throwExceptionRead = false;
311
312 for(auto const* dev : {"Use1", "Use2"}) {
313 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
314 }
315}
316
317/**********************************************************************************************************************/
318
320 explicit BlockInitTestApp() : BasicTestApp("TestStepApp") {
323 }
324 ~BlockInitTestApp() override { shutdown(); }
325
326 // The execution of the first init functions can be blocked.
327 // The first init handler will run through, the second one will block.
328 std::atomic<bool> blockInit{false};
329 std::atomic<size_t> initCounter{0};
330 std::barrier<> arrivedInInitHandler{2};
// Init function which can be blocked by the test via the blockInit flag.
// NOTE(review): presumably registered as initialisation handler in the constructor (not visible here) - confirm.
//
// While blockInit is set, each call increments initCounter. The call which brings the counter to 2 notifies the
// test thread through the arrivedInInitHandler barrier and then polls until blockInit has been cleared. All other
// calls pass straight through. In any case the function finally opens the device given by its alias/CDD and writes
// 1 to /MyModule/actuator.
//
// device: alias or CDD of the device to write to.
331 void init1(const std::string& device) {
332 // cheap implementation with busy waiting
333 if(blockInit) {
334 if(++initCounter == 2) {
// Only arrive (do not wait): the test thread is the second participant and waits on this barrier.
335 (void)arrivedInInitHandler.arrive();
336
// Poll the flag with a short sleep (100 microseconds) until the test releases us.
337 while(blockInit) {
338 usleep(100);
339 }
340 }
341 }
// Use a separate Device object to perform the write.
342 ctk::Device d{device};
343 d.open();
344 d.write("/MyModule/actuator", 1);
345 }
346};
347
348/**********************************************************************************************************************/
355 // pre-condition: all devices in recovery group are OK
356 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
357 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
358 }
359
360 // While everything is functional, set values for some variables. They are restored during the recovery process.
361 testFacility.writeScalar<uint32_t>("/Use1/Integers/unsigned32", 17);
362 testFacility.writeScalar<uint32_t>("/Use2/Integers/unsigned32", 18);
363
364 // Wait until they have arrived, then overwrite them and the values set in the init script.
365 CHECK_TIMEOUT(raw1.read<uint32_t>("/Integers/unsigned32") == 17, 10000);
366 CHECK_TIMEOUT(raw2.read<uint32_t>("/Integers/unsigned32") == 18, 10000);
367 raw1.write<uint32_t>("/Integers/unsigned32", 13);
368 raw2.write<uint32_t>("/Integers/unsigned32", 14);
369
370 // Block the init handler, set an error condition on 1 and trigger a read.
371 testApp.blockInit = true;
372 // in case something goes wrong in the test: make sure the process terminates
373 auto _ = cppext::finally([&]() { testApp.blockInit = false; });
374
375 auto dummy1 = boost::dynamic_pointer_cast<ctk::ExceptionDummy>(raw1.getBackend());
376 dummy1->throwExceptionOpen = true;
377 dummy1->throwExceptionRead = true;
378
379 trigger.write();
380 // wait until the errors have been seen.
381 for(auto const* dev : {"Use1", "Use2"}) {
382 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 1, 10000);
383 }
384
385 // Stage 2: Resolve the error.
386 dummy1->throwExceptionOpen = false;
387 dummy1->throwExceptionRead = false;
388
389 // wait until one init handler has run, and the other is blocking
390 testApp.arrivedInInitHandler.arrive_and_wait();
391 assert(testApp.initCounter == 2);
392 // We know one of the backends is closed when entering the init handler, so we have to re-open it.
393 // As we don't know which one, we just open both.
394 raw1.open();
395 raw2.open();
396
397 // The actual test: none of the recovery values has been written
398 BOOST_TEST(raw1.read<int32_t>("Integers/unsigned32") == 13);
399 BOOST_TEST(raw2.read<int32_t>("Integers/unsigned32") == 14);
400
401 // Stage 3: Release the blocking init handler and check that the device recovers.
402 testApp.blockInit = false;
403
404 for(auto const* dev : {"Use1", "Use2"}) {
405 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
406 }
407}
408
409/**********************************************************************************************************************/
411 explicit InitFailureApp() : BasicTestApp("InitFailureApp") {
414 }
415 ~InitFailureApp() override { shutdown(); }
416
417 // InitFunction to raise an error
418 std::atomic<bool> blockInitOnce{false}; // block the execution of at the start if all init handlers
419 std::barrier<> blockInitArrivedBarrier{2};
420 std::barrier<> blockInitContinueBarrier{2};
421 std::atomic<bool> failInit{false};
422 std::atomic<size_t> initCounter{0};
423 std::atomic<size_t> initSuccessCounter{0};
424 std::barrier<> aboutToFail{2}; // notify the test where we are. It has to do some checks
425 std::barrier<> proceedWithFail{2}; // wait for the test to complete its checks
426 void init() {
427 if(blockInitOnce) {
428 blockInitOnce = false;
429 (void)blockInitArrivedBarrier.arrive(); // notify the test
430 blockInitContinueBarrier.arrive_and_wait(); // only continue when testing is done
431 }
432 if(failInit) {
433 if(++initCounter == 2) {
434 // This branch will be only hit once because the counter is higher afterwards.
435 (void)aboutToFail.arrive(); // notify the test that it can do the preparation
436 proceedWithFail.arrive_and_wait(); // wait for the test to complete the preparation
437 throw ctk::runtime_error("Intentional failure in init()");
438 }
439 }
441 }
442};
443
444/**********************************************************************************************************************/
451 // Side effect: This test is also checking that the error condition of a failure in the init handler does
452 // not confuse the barrier order and lock up the manager (basically error handling smoke test).
453
454 // This test contains three checks:
455 // 1. *All* device managers restart the recovery.
456 // 2. The recovery restarts with *open* (not only the init step is repeated).
457 // 3. The recovery happens *after the POST-INIT-HANDLER barrier*.
458
459 // pre-condition: all devices in recovery group are OK
460 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
461 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
462 }
463
464 // preparation for check 3: Use1 is at the POST-INIT-HANDLER barrier, i.e. init handler is through, recovering write
465 // values is not.
466 testFacility.writeScalar<uint32_t>("/Use1/Integers/unsigned32", 17); // written by recovery write
467 testFacility.writeScalar<uint32_t>("/Use2/Integers/unsigned32", 18); // written by recovery write
468 testFacility.writeScalar<int16_t>("/Use12/Integers/signed16", 19); // written by recovery write
469 // Wait until the values have arrived at the device, then overwrite them
470 CHECK_TIMEOUT(raw1.read<uint32_t>("/Integers/unsigned32") == 17, 10000);
471 CHECK_TIMEOUT(raw2.read<uint32_t>("/Integers/unsigned32") == 18, 10000);
472 CHECK_TIMEOUT(raw1.read<int16_t>("/Integers/signed16") == 19, 10000);
473 raw1.write<uint32_t>("/Integers/unsigned32", 13);
474 raw2.write<uint32_t>("/Integers/unsigned32", 14);
475 raw1.write<int16_t>("/Integers/signed16", 15);
476
477 // Set the init script to fail and trigger an error condition.
478 testApp.failInit = true;
479 testApp.initSuccessCounter = 0;
480 testApp.singleDev1.dev.reportException("reported from TestInitFailure");
481
482 testApp.aboutToFail.arrive_and_wait();
483
484 // Check 3 part 1: One of the init handlers increased the initSuccessCounter, so we know it has
485 // run and due to the sleeps, we can be pretty sure it has arrived at the POST-INIT-HANDLER barrier.
486 BOOST_CHECK(testApp.initSuccessCounter == 1);
487
488 // Preparation. At this point we know that
489 // - One of the init handlers has run through.
490 // - The other init handler is waiting in the init handler
491 // - The successful init handler has a higher open count because the device is re-opened after the init handler
492 // But we don't know which device is in which state.
493 // So we store all open counters.
494 auto testLmap1 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
495 testApp.singleDev1.dev.getDeviceManager().getDevice().getBackend());
496 auto testLmap2 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
497 testApp.singleDev2.dev.getDeviceManager().getDevice().getBackend());
498 auto testLmap12 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
499 testApp.mappedDev12.dev.getDeviceManager().getDevice().getBackend());
500 size_t openCount1 = testLmap1->openCounter;
501 size_t openCount2 = testLmap2->openCounter;
502 size_t openCount12 = testLmap12->openCounter;
503
504 // Also block the execution of newly starting init handlers so we know that at this point only
505 // the open step has happened. This simplifies testing.
506 testApp.blockInitOnce = true;
507
508 // now make the second init handler throw
509 (void)testApp.proceedWithFail.arrive();
510
511 // Check 1 and 2: *All* DeviceManagers have *restarted* the recovery procedure.
512 // The restart of the recovery procedure is detected by looking at the open counter.
513 testApp.blockInitArrivedBarrier.arrive_and_wait();
514 BOOST_TEST(testLmap1->openCounter == openCount1 + 1);
515 BOOST_TEST(testLmap2->openCounter == openCount2 + 1);
516 BOOST_TEST(testLmap12->openCounter == openCount12 + 1);
517
518 // Check 3: The recovery actually restarted after the POST-INIT-HANDLER and no write recovery was done.
519 // We know that one of the init handlers is blocking, so one of the devices is closed,
520 // but no further close will happen while the init handler is blocking.
521 // Just reopen all devices.
522 raw1.open();
523 raw2.open();
524 BOOST_TEST(raw1.read<uint32_t>("/Integers/unsigned32") == 13);
525 BOOST_TEST(raw2.read<uint32_t>("/Integers/unsigned32") == 14);
526 BOOST_TEST(raw1.read<int16_t>("/Integers/signed16") == 15);
527
528 // Resolve the error condition and wait until everything has recovered.
529 testApp.failInit = false;
530 (void)testApp.blockInitContinueBarrier.arrive();
531 for(auto const* dev : {"Use1", "Use2"}) {
532 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
533 }
534}
535
536/**********************************************************************************************************************/
537
538// App used by TestRecoveryWriteBarrier and TestIncompleteWriteRecovery
540 explicit WriteRecoveryTestApp() : Application("RecoveryFailureTestApp") {}
542
543 ctk::SetDMapFilePath path{"recoveryGroups.dmap"};
544
545 // recovery group with Use1 and Use2
548 // Use the combining xlmap file which does not use write registers.
549 // The tests (TestRecoveryWriteBarrier, TestIncompleteWriteRecovery) require that there is only one register
550 // written on backend 2.
551 DeviceModuleWithPath mappedDev12{this, "Use12ReadOnly"};
552};
553
554/**********************************************************************************************************************/
555
563 // pre-condition: all devices OK
564 for(auto const* dev : {"Use1", "Use2", "Use12ReadOnly"}) {
565 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
566 }
567 // Write something to Use1 so we can check when its recovery accessor writing is through.
568 testFacility.writeScalar<uint32_t>("/Use1/Integers/unsigned32", 18);
569 CHECK_TIMEOUT(raw1.read<uint32_t>("Integers/unsigned32") == 18, 10000);
570 // Change the value on the device to detect when the recovery writing is through.
571 raw1.write("Integers/unsigned32", 0);
572
573 // Block Use2 and trigger a recovery
574 auto dummy1 = boost::dynamic_pointer_cast<WriteBlockingDummy>(raw1.getBackend());
575 auto dummy2 = boost::dynamic_pointer_cast<WriteBlockingDummy>(raw2.getBackend());
576 dummy2->blockWriteOnce = true;
577 testApp.singleDev2.dev.reportException("reported from TestRecoveryWriteBarrier");
578
579 // Wait until Use2 is blocking
580 dummy2->blockWriteArrivedBarrier.arrive_and_wait();
581
582 // store the Use1 open counters
583 auto lmapDummy1 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
584 testApp.singleDev1.dev.getDeviceManager().getDevice().getBackend());
585 size_t openCount1 = lmapDummy1->openCounter;
586
587 // The actual test:
588 // Even though Use1 has completed the recovery write step, it has not reset the internal error flag yet,
589 // so reporting an exception does not cause another recovery round (monitored by looking at the open counter
590 // after the successful recovery).
591 CHECK_TIMEOUT(raw1.read<uint32_t>("Integers/unsigned32") == 18, 10000); // recovery write step complete
592 testApp.singleDev1.dev.reportException("This exception should be suppressed.");
593 (void)dummy2->blockWriteContinueBarrier.arrive();
594 for(auto const* dev : {"Use1", "Use2", "Use12ReadOnly"}) {
595 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
596 }
597 BOOST_TEST(lmapDummy1->openCounter == openCount1);
598}
599
600/**********************************************************************************************************************/
601
609 // Side effect: This test is checking that the error condition of a failure when writing the recovery accessors does
610 // not confuse the barrier order and lock up the manager.
611
612 // This test contains three checks:
613 // 1. *All* device managers restart the recovery.
614 // 2. The recovery restarts with *open* (not only the init step is repeated).
615 // 3. The recovery happens *after the POST-WRITE-RECOVERY barrier*.
616
617 // pre-condition: all devices OK
618 for(auto const* dev : {"Use1", "Use2", "Use12ReadOnly"}) {
619 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
620 }
621
622 // Write something to Use1 so we can check when its recovery accessor writing is through.
623 testFacility.writeScalar<uint32_t>("/Use1/Integers/unsigned32", 18);
624 CHECK_TIMEOUT(raw1.read<uint32_t>("Integers/unsigned32") == 18, 10000);
625 // Change the value on the device to detect that the writing is through.
626 raw1.write("Integers/unsigned32", 0);
627
628 // create an error condition which throws when writing (the recovery accessors)
629 auto dummy2 = boost::dynamic_pointer_cast<WriteBlockingDummy>(raw2.getBackend());
630 dummy2->throwExceptionWrite = true;
631 dummy2->blockWriteOnce = true;
632 testApp.singleDev2.dev.reportException("reported from TestRecoveryWriteFailure");
633
634 // Wait until Use2 is blocking and Use1 has restored the write values.
635 dummy2->blockWriteArrivedBarrier.arrive_and_wait();
636 CHECK_TIMEOUT(raw1.read<uint32_t>("Integers/unsigned32") == 18, 10000);
637 // sleep a bit so we can be pretty sure that Use1 has arrived at the POST-WRITE-RECOVERY barrier
638 usleep(100000); // 100 ms
639
640 // Take a snapshot of the open counters for checks 1 and 2.
641 auto testLmap1 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
642 testApp.singleDev1.dev.getDeviceManager().getDevice().getBackend());
643 auto testLmap2 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
644 testApp.singleDev2.dev.getDeviceManager().getDevice().getBackend());
645 auto testLmap12 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
646 testApp.mappedDev12.dev.getDeviceManager().getDevice().getBackend());
647 size_t openCount1 = testLmap1->openCounter;
648 size_t openCount2 = testLmap2->openCounter;
649 size_t openCount12 = testLmap12->openCounter;
650
651 // preparation for check 3: recovery started directly after the POST-WRITE-RECOVERY barrier
652 // Get an asynchronous variable from Use1. It must not have seen any data after the exception
653 auto pushedSigned32 = testFacility.getScalar<int32_t>("/Use1/Integers/pushedSigned32");
654 pushedSigned32.readLatest(); // just empty the queue.
655 // The last thing we should have seen is the exception, so data validity is faulty.
656 BOOST_CHECK(pushedSigned32.dataValidity() == ctk::DataValidity::faulty);
657
658 // Now let Use2 continue and throw the write exception. Already request to
659 // stop at the next write.
660 dummy2->blockWriteOnce = true;
661 (void)dummy2->blockWriteContinueBarrier.arrive();
662 // Now the recovery should see an error and continue from the beginning.
663 // Wait again until Use2 blocks when writing.
664 dummy2->blockWriteArrivedBarrier.arrive_and_wait();
665
666 // Check 1 and 2: *All* DeviceManagers have *restarted* the recovery procedure.
667 // The restart of the recovery procedure is detected by looking at the open counter.
668 BOOST_TEST(testLmap1->openCounter == openCount1 + 1);
669 BOOST_TEST(testLmap2->openCounter == openCount2 + 1);
670 BOOST_TEST(testLmap12->openCounter == openCount12 + 1);
671
672 // Check 3: After seeing the exception in Use2, Use1 has not completed the
673 // recovery after the POST-WRITE-RECOVERY barrier and hence async read is not turned on yet.
674 // Wait a bit (100 ms) for data to arrive, but not too long as we don't expect anything.
675 usleep(100000);
676 BOOST_CHECK(!pushedSigned32.readNonBlocking());
677
678 // Finally, resolve the error condition and wait until everything recovers.
679 dummy2->throwExceptionWrite = false;
680 (void)dummy2->blockWriteContinueBarrier.arrive();
681 for(auto const* dev : {"Use1", "Use2", "Use12ReadOnly"}) {
682 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
683 }
684}
685
686/**********************************************************************************************************************/
687
// Race condition test: an exception reported to Use12 right after Use1 has delivered its initial push-type value
// must not get lost. It has to trigger a recovery round (seen as a second open() of the Use1 backend), after which
// all devices of the recovery group return to status 0.
693BOOST_AUTO_TEST_CASE(TestClearErrorBarrier) {
694 // This test is trying to provoke a rare race condition that existed (and should now be removed). If
695 // one backend has already activated the async read and has not waited at the POST-CLEAR-ERROR barrier,
696 // another thread might not have cleared the internal error condition yet and will discard the reported exception.
697 // In this case it will never reach the POST-DETECT barrier and all DeviceManagers in this recovery group are stuck,
698 // because the others are waiting there.
699
700 BasicTestApp testApp;
701 ChimeraTK::TestFacility testFacility{testApp, /* enableTestableMode */ false};
// The accessor is obtained before runApplication(), so that the initial value can be received below.
702 auto pushed1 = testFacility.getScalar<int>(std::string("/Use1/Integers/pushedSigned32"));
703 testFacility.runApplication();
704
705 // Maximise the chance for the race condition:
706 // As soon as Use1 sends the initial value to the push type accessor, report an exception to Use12.
707 // If it misses it, no recovery is triggered and the open count is 1.
708 // If it got it, there was a recovery round and the open count is 2.
709 pushed1.read();
710 testApp.mappedDev12.dev.reportException("reported from TestClearErrorBarrier");
711
// The actual test: the open counter of the Use1 backend must reach 2 (initial open plus one recovery round).
712 auto testLmap1 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
713 testApp.singleDev1.dev.getDeviceManager().getDevice().getBackend());
714 CHECK_TIMEOUT(testLmap1->openCounter == 2, 10000);
715
716 // Wait for recovery to complete.
717 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
718 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
719 }
720}
721
722/**********************************************************************************************************************/
723
730BOOST_AUTO_TEST_CASE(TestIncompleteRecoveryOpen) {
731 { // open a new scope so we can test after the app goes out of scope
732 BasicTestApp testApp;
733 ChimeraTK::TestFacility testFacility{testApp, /* enableTestableMode */ false};
734 testFacility.runApplication();
735
736 // pre-condition: all devices in recovery group are OK
737 for(auto const* dev : {"Use1", "Use2", "Use12"}) {
738 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
739 }
740
741 // Prepare throwing thread_interrupted in open
743 auto testLmap2 = boost::dynamic_pointer_cast<OpenCountingLmapBackend>(
744 testApp.singleDev2.dev.getDeviceManager().getDevice().getBackend());
745 testLmap2->throwThreadInterrupted = true;
746 testApp.singleDev1.dev.reportException("reported from TestIncompleteRecoveryOpen");
747
748 // Wait until the dummy backend told us it is about to throw.
749 testLmap2->aboutToThrowArrivedBarrier.arrive_and_wait();
750
751 // wait until the other DeviceManager has opened its backend, then sleep a bit to be pretty sure
752 // it has reached the barrier
753 CHECK_TIMEOUT(testApp.singleDev1.dev.getDeviceManager().getDevice().isFunctional(), 10000);
754 usleep(100000);
755
756 // once we let dummy2 continue it will throw.
757 (void)testLmap2->aboutToThrowContinueBarrier.arrive();
758 }
759 // The actual test: We reached this point, the test did not block
760 BOOST_CHECK(true);
761}
762
763/**********************************************************************************************************************/
764
// Constructor: passes the application name "IncompleteRecoveryTestApp" to the Application base class.
// NOTE(review): this listing jumps from source line 766 to 769, so any statements in the constructor body are
// not visible here - check the original file before editing.
766 explicit IncompleteRecoveryTestApp() : Application("IncompleteRecoveryTestApp") {
769 }
771
// Sets the DMAP file path to "recoveryGroups.dmap".
772 ctk::SetDMapFilePath path{"recoveryGroups.dmap"};
773
// Armed by the test; while false, init() does nothing.
774 std::atomic<bool> throwInInit{false};
// Number of init() calls seen while throwInInit was set; the call that brings it to 2 throws.
775 std::atomic<size_t> initCounter{0};
// Rendezvous between the throwing init() call and the test thread (two participants).
776 std::barrier<> aboutToThrow{2};
// Aborts a recovery from within init(): does nothing unless throwInInit is set. Once it is set, the call that
// raises initCounter to 2 sleeps 100 ms, meets the test thread on the aboutToThrow barrier and then throws
// boost::thread_interrupted. Every other call returns normally.
// Presumably registered as initialisation handler for the devices of this application - the registration is not
// visible in this listing, TODO confirm.
777 void init() {
778 // Only act once the test has armed throwInInit.
779 if(throwInInit) {
780 if(++initCounter == 2) {
781 // The other init handler has passed this point already. Wait a bit to be pretty sure it has reached
782 // the INIT_HANDLER barrier.
783 usleep(100000); // 100 ms
784
785 // Tell the test thread that we are here, about to throw the exception
786 aboutToThrow.arrive_and_wait();
787
788 // Jump out of the DeviceManager main loop with a thread_interrupted exception, just like all other
789 // breakpoints do
790 throw boost::thread_interrupted(); // NOLINT hicpp-exception-baseclass
791 }
792 }
793 }
794
795 // recovery group with Use1 and Use2
798 DeviceModuleWithPath mappedDev12{this, "Use12ReadOnly"};
799};
800
801/**********************************************************************************************************************/
802
// Checks that the application can be destroyed while a recovery is cut short during the init handler step.
// The test arms testApp.throwInInit, triggers a recovery by reporting an exception on singleDev1, and waits on
// testApp.aboutToThrow until the throwing init handler has been reached.
// The test passes if leaving the inner scope afterwards does not block.
// NOTE(review): the declaration of testApp (source line 811) is missing from this listing; presumably it is an
// IncompleteRecoveryTestApp, since throwInInit and aboutToThrow are used - confirm against the original file.
809BOOST_AUTO_TEST_CASE(TestIncompleteRecoveryInit) {
810 { // open a new scope so we can test after the app goes out of scope
812 ChimeraTK::TestFacility testFacility{testApp, /* enableTestableMode */ false};
813 testFacility.runApplication();
814
815 // pre-condition: all devices in recovery group are OK (status == 0), each awaited for at most 10000 ms
816 for(auto const* dev : {"Use1", "Use2", "Use12ReadOnly"}) {
817 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
818 }
819
// Arm the throwing behaviour first, then start the recovery by reporting an exception.
820 testApp.throwInInit = true;
821 testApp.singleDev1.dev.reportException("reported from TestIncompleteRecoveryInit");
822
823 // Wait until the init handler which will throw told us it has reached that point, so we don't end the application
824 // scope before the test is sensitive.
825 // The second init handler which is run does the blocking, and sleeps a bit before arriving here, so we are pretty
826 // sure that the other init handler has reached the barrier.
827 testApp.aboutToThrow.arrive_and_wait();
828 }
829 // The actual test: We reached this point, the test did not block
830 BOOST_CHECK(true);
831}
832
833/**********************************************************************************************************************/
834
// Checks that the application can be destroyed while a recovery is cut short during the write step.
// The Raw2 backend (a WriteBlockingDummy) is armed to block one write and then throw boost::thread_interrupted,
// and a recovery is triggered by reporting an exception on singleDev1.
// The test passes if leaving the inner scope afterwards does not block.
841BOOST_AUTO_TEST_CASE(TestIncompleteWriteRecovery) {
842 { // open a new scope so we can test after the app goes out of scope
843
844 // Use the WriteRecoveryTestApp with Use12ReadOnly because we again require that only Use2 is writing to the backend
845 WriteRecoveryTestApp testApp;
846 ChimeraTK::TestFacility testFacility{testApp, /* enableTestableMode */ false};
847 testFacility.runApplication();
848
849 // pre-condition: all devices in recovery group are OK (status == 0), each awaited for at most 10000 ms
850 for(auto const* dev : {"Use1", "Use2", "Use12ReadOnly"}) {
851 CHECK_TIMEOUT(testFacility.readScalar<int>(std::string("Devices/") + dev + "/status") == 0, 10000);
852 }
853
// Direct device handle, used to inspect and modify the register behind Use1.
854 ctk::Device raw1{"Raw1"};
855 raw1.open();
856
857 // Write something to Use1 so we can check when its recovery accessor writing is through.
858 testFacility.writeScalar<uint32_t>("/Use1/Integers/unsigned32", 18);
859 CHECK_TIMEOUT(raw1.read<uint32_t>("Integers/unsigned32") == 18, 10000);
860 // Change the value on the device to detect that the writing is through.
861 raw1.write("Integers/unsigned32", 0);
862
863 // Prepare throwing in thread_interrupted in the write recovery
// NOTE(review): the result of the dynamic_pointer_cast is dereferenced without a null check; this assumes the
// Raw2 backend really is a WriteBlockingDummy - confirm against the dmap file.
864 ctk::Device raw2{"Raw2"};
865 raw2.open();
866 auto dummy2 = boost::dynamic_pointer_cast<WriteBlockingDummy>(raw2.getBackend());
867 dummy2->blockWriteOnce = true;
868 dummy2->throwThreadInterrupted = true;
// Reporting an exception on singleDev1 is what starts the recovery.
869 testApp.singleDev1.dev.reportException("reported from TestIncompleteWriteRecovery");
870
871 // Wait until the dummy backend told us it is about to throw.
872 dummy2->blockWriteArrivedBarrier.arrive_and_wait();
873
874 // wait until the other DeviceManager has written its values (the 0 written above is replaced by 18 again),
875 // then sleep a bit to be pretty sure it has reached the barrier
876 CHECK_TIMEOUT(raw1.read<uint32_t>("Integers/unsigned32") == 18, 10000);
877 usleep(100000); // 100 ms
878
879 // once we let dummy2 continue it will throw.
// arrive() does not wait for the other participant; the (void) cast discards its return value.
880 (void)dummy2->blockWriteContinueBarrier.arrive();
881 }
882 // The actual test: We reached this point, the test did not block
883 BOOST_CHECK(true);
884}
885
886/**********************************************************************************************************************/
void shutdown() override
This will remove the global pointer to the instance and allows creating another instance afterwards.
Device & getDevice()
Return the underlying ChimeraTK::Device object.
void addInitialisationHandler(std::function< void(ChimeraTK::Device &)> initialisationHandler)
void reportException(std::string errMsg)
Use this function to report an exception.
DeviceManager & getDeviceManager()
Return the corresponding DeviceManager.
friend class Application
Definition ModuleGroup.h:47
ModuleGroup()=default
Default constructor to allow late initialisation of module groups.
Helper class to set the DMAP file path.
Helper class to facilitate tests of applications based on ApplicationCore.
ChimeraTK::VoidRegisterAccessor getVoid(const ChimeraTK::RegisterPath &name) const
Obtain a void process variable from the application, which is published to the control system.
void runApplication() const
Start the application in testable mode.
InvalidityTracer application module.
DeviceModuleWithPath mappedDev12
BasicTestApp(const std::string &name="BasicTestApp")
ctk::SetDMapFilePath path
DeviceModuleWithPath singleDev3
DeviceModuleWithPath singleDev1
DeviceModuleWithPath singleDev2
~BasicTestApp() override
void init1(const std::string &device)
std::atomic< bool > blockInit
std::atomic< size_t > initCounter
std::barrier arrivedInInitHandler
DeviceModuleWithPath(ModuleGroup *owner, std::string const &cdd)
ctk::VoidRegisterAccessor trigger
ctk::Device raw1
ChimeraTK::TestFacility testFacility
ctk::Device raw2
DeviceModuleWithPath singleDev1
DeviceModuleWithPath singleDev2
std::atomic< size_t > initCounter
DeviceModuleWithPath mappedDev12
std::atomic< bool > throwInInit
std::barrier blockInitArrivedBarrier
std::atomic< bool > blockInitOnce
std::atomic< bool > failInit
std::barrier proceedWithFail
std::barrier blockInitContinueBarrier
std::atomic< size_t > initSuccessCounter
std::atomic< size_t > initCounter
std::barrier aboutToFail
std::barrier aboutToThrowArrivedBarrier
static boost::shared_ptr< DeviceBackend > creatorFunction(std::string, std::map< std::string, std::string > parameters)
std::atomic< size_t > openCounter
static std::atomic< size_t > globalOpenCounter
std::atomic< bool > throwThreadInterrupted
std::atomic< bool > blockOpen
std::barrier aboutToThrowContinueBarrier
static boost::shared_ptr< DeviceBackend > creatorFunction(std::string, std::map< std::string, std::string > parameters)
void write(uint64_t bar, uint64_t address, int32_t const *data, size_t sizeInBytes) override
std::atomic< bool > blockWriteOnce
std::barrier blockWriteContinueBarrier
std::atomic< bool > throwThreadInterrupted
std::barrier blockWriteArrivedBarrier
ctk::SetDMapFilePath path
DeviceModuleWithPath mappedDev12
DeviceModuleWithPath singleDev2
DeviceModuleWithPath singleDev1
constexpr std::string_view cdd
#define CHECK_TIMEOUT(condition, maxMilliseconds)
BOOST_FIXTURE_TEST_CASE(TestRecoveryGroups, Fixture< BasicTestApp >)
A.5 DeviceManagers with at least one common involved backend ID (see DeviceBackend::getInvolvedBacken...
BOOST_AUTO_TEST_CASE(TestClearErrorBarrier)
B.3.2.4.1 DeviceManagers wait until all involved DeviceManagers clear their internal error before fla...