diff --git a/CITATION b/CITATION index 3139b19..9c29544 100644 --- a/CITATION +++ b/CITATION @@ -1,3 +1,3 @@ Please cite as: -Ed Bennett, Lester Hedges, Matt Williams, "Introduction to automated testing and continuous integration in Python" +Ed Bennett, Lester Hedges, Julian Lenz, Matt Williams, "Introduction to automated testing and continuous integration in Python" diff --git a/_episodes/02-pytest-functionality.md b/_episodes/02-pytest-functionality.md index af90e2c..7f7a238 100644 --- a/_episodes/02-pytest-functionality.md +++ b/_episodes/02-pytest-functionality.md @@ -36,7 +36,7 @@ Lets add a second test to check a different set of inputs and outputs to the ~~~ from arrays import add_arrays -def test_add_arrays1(): +def test_add_arrays_positive(): a = [1, 2, 3] b = [4, 5, 6] expect = [5, 7, 9] @@ -45,7 +45,7 @@ def test_add_arrays1(): assert output == expect -def test_add_arrays2(): +def test_add_arrays_negative(): a = [-1, -5, -3] b = [-4, -3, 0] expect = [-5, -8, -3] @@ -73,8 +73,8 @@ rootdir: /home/matt/projects/courses/software_engineering_best_practices plugins: requests-mock-1.8.0 collected 2 items -test_arrays.py::test_add_arrays1 PASSED [ 50%] -test_arrays.py::test_add_arrays2 PASSED [100%] +test_arrays.py::test_add_arrays_positive PASSED [ 50%] +test_arrays.py::test_add_arrays_negative PASSED [100%] ==================== 2 passed in 0.07s ===================== ~~~ @@ -166,6 +166,57 @@ test_arrays.py::test_add_arrays[a1-b1-expect1] PASSED [100%] We see that both tests have the same name (`test_arrays.py::test_add_arrays`) but each parametrization is differentiated with some square brackets. +Unfortunately, in the current form this differentiation is not very helpful. If +you run this test later, you might not remember what `a0-b0-expect0` means, let +alone the precise numbers or the motivation for choosing them. Were those the +positive inputs or the negative ones? Did I choose them after fixing a +particular bug, because they are an important use case, or were they just random +numbers? + +Luckily, we are not the first ones to realise that the above form of +parametrization misses the expressiveness of explicit function names. That's why +there is an additional `ids` keyword argument: the following code + +~~~ +import pytest + +from arrays import add_arrays + +@pytest.mark.parametrize("a, b, expect", [ + ([1, 2, 3], [4, 5, 6], [5, 7, 9]), + ([-1, -5, -3], [-4, -3, 0], [-5, -8, -3]), +], ids=['positive', 'negative']) +def test_add_arrays(a, b, expect): + output = add_arrays(a, b) + + assert output == expect +~~~ +{: .language-python} + +now results in the significantly more expressive + +~~~ +=================== test session starts ==================== +platform linux -- Python 3.8.5, pytest-6.0.1, py-1.9.0, pluggy-0.13.1 -- /usr/bin/python3 +cachedir: .pytest_cache +rootdir: /home/matt/projects/courses/software_engineering_best_practices +plugins: requests-mock-1.8.0 +collected 2 items + +test_arrays.py::test_add_arrays[positive] PASSED [ 50%] +test_arrays.py::test_add_arrays[negative] PASSED [100%] + +==================== 2 passed in 0.03s ===================== +~~~ +{: .output} + +If the arguments are more naturally represented as strings than the lists in our +example here, `pytest` often does a reasonably good job of generating `ids` +automatically from the values (we will see some examples of this in the next +section).
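For instance (a hypothetical illustration, not part of the `arrays` example used in this lesson), string parameters show up directly in the generated test ids:

~~~
import pytest


@pytest.mark.parametrize("word, expect", [
    ("hello", 5),
    ("python", 6),
])
def test_word_length(word, expect):
    assert len(word) == expect
~~~
{: .language-python}

Here `pytest` would build the ids from the values themselves and report something like `test_word_length[hello-5]` and `test_word_length[python-6]`.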
But this still lacks the intentional communication that is associated +with manually chosen `ids`, so we strongly recommend using `ids` in all but the +most trivial cases. > ## More parameters > @@ -185,6 +236,7 @@ but each parametrization is differentiated with some square brackets. >> ([-1, -5, -3], [-4, -3, 0], [-5, -8, -3]), # Test zeros >> ([41, 0, 3], [4, 76, 32], [45, 76, 35]), # Test larger numbers >> ([], [], []), # Test empty lists ->> ]) +>> ], ids=["positive", "negative", "larger numbers", "empty lists"]) >> def test_add_arrays(a, b, expect): >> output = add_arrays(a, b) @@ -195,7 +247,6 @@ but each parametrization is differentiated with some square brackets. > {: .solution} {: .challenge} - ## Failing correctly The interface of a function is made up of the _parameters_ it expects and the @@ -280,6 +331,7 @@ from arrays import add_arrays @pytest.mark.parametrize("a, b, expect", [ ([1, 2, 3], [4, 5, 6], [5, 7, 9]), ([-1, -5, -3], [-4, -3, 0], [-5, -8, -3]), -]) +], ids=["positive", "negative"]) def test_add_arrays(a, b, expect): output = add_arrays(a, b) @@ -307,8 +359,8 @@ rootdir: /home/matt/projects/courses/software_engineering_best_practices plugins: requests-mock-1.8.0 collected 3 items -test_arrays.py::test_add_arrays[a0-b0-expect0] PASSED [ 33%] -test_arrays.py::test_add_arrays[a1-b1-expect1] PASSED [ 66%] +test_arrays.py::test_add_arrays[positive] PASSED [ 33%] +test_arrays.py::test_add_arrays[negative] PASSED [ 66%] test_arrays.py::test_add_arrays_error PASSED [100%] ==================== 3 passed in 0.03s ===================== @@ -326,6 +378,7 @@ test_arrays.py::test_add_arrays_error PASSED [100%] >> @pytest.mark.parametrize("a, b, expected_error", [ >> ([1, 2, 3], [4, 5], ValueError), >> ([1, 2], [4, 5, 6], ValueError), ->> ]) +>> ], ids=['second shorter', 'first shorter']) >> def test_add_arrays_error(a, b, expected_error): >> with pytest.raises(expected_error): @@ -354,6 +407,7 @@ test_arrays.py::test_add_arrays_error PASSED [100%] >> ([6], [3], [2]), # Test single-element lists >> ([1, 2, 3], [4, 5, 6], [0.25, 0.4, 0.5]), # Test non-integers >> ([], [], []), # Test empty lists ->> ]) +>> ], ids=["int", "negative int", "single-element", "non-int", "empty lists"]) >> def test_divide_arrays(a, b, expect): >> output = divide_arrays(a, b) @@ -365,6 +419,7 @@ test_arrays.py::test_add_arrays_error PASSED [100%] >> ([1, 2, 3], [4, 5], ValueError), >> ([1, 2], [4, 5, 6], ValueError), >> ([1, 2, 3], [0, 1, 2], ZeroDivisionError), ->> ]) +>> ], ids=['second shorter', 'first shorter', 'zero division']) >> def test_divide_arrays_error(a, b, expected_error): >> with pytest.raises(expected_error): diff --git a/_episodes/03-fixtures.md b/_episodes/03-fixtures.md index 1d4658f..89cc0c3 100644 --- a/_episodes/03-fixtures.md +++ b/_episodes/03-fixtures.md @@ -414,7 +414,8 @@ being done once. > behaviour of the tests, and pytest prioritises correctness of the tests over > their performance. > -> What sort of behavior would functions have that failed in this way? +> What sort of behavior would functions have that failed in this way? Can you +> come up with example code for this? > >> ## Solution >> @@ -425,6 +426,80 @@ being done once. >> >> Fixtures should only be re-used within groups of tests that do not mutate >> them.
+>> +>> ~~~ +>> @pytest.fixture(scope="session") +>> def initially_empty_list(): +>> return [] +>> +>> +>> @pytest.mark.parametrize("letter", ["a", "b", "c"]) +>> def test_append_letter(initially_empty_list, letter): +>> initially_empty_list.append(letter) +>> assert initially_empty_list == [letter] +>> ~~~ +>> {: .language-python} +> {: .solution} +{: .challenge} + +> ## Better ways to (unit) test +> +> The above example was explicitly constructed to acquire an expensive resource +> and exhibit a big advantage when using a fixture, but is it actually a good +> way to test the `word_counts` function? Think about what `word_counts` is +> supposed to do. Do you need a whole book to test this? +> +> List advantages and disadvantages of the above approach. Then, come up with +> another way of testing it that cures the disadvantages (maybe also losing +> some of the advantages). Is your approach simpler and less error-prone? +> +> It is safe to assume that whenever you test such a function, it is supposed to +> be used in a larger project. Can you think of a test scenario where the +> original method is the best? +> +>> ## Solution +>> +>> The `word_counts` function is designed to count words in any string. It does +>> not need a whole book to test counting, so we could have also used tiny test +>> strings like `""`, `"hello world"`, `"hello, hello world"` to test all +>> functionality of `word_counts` (see the sketch below). In fact, the original approach has a number +>> of disadvantages: +>> +>> * It is (time) expensive because it needs to download the book every time the +>> test suite is run. (2s for a single test is a very long time if you want to run +>> a test suite of hundreds of such tests every few minutes.) +>> * It is brittle in various ways: +>> - If you don't have an internet connection, your test fails. +>> - If the URL changes, your test fails. +>> - If the content changes, your test fails (we had that a few times). +>> * It is very obscure because you cannot know if the numbers we have given you +>> are correct. Maybe the function has a bug that we don't know about because +>> admittedly we also just used the output of that function to generate our +>> test cases. +>> +>> The one big advantage of the above is that you are using realistic test data. +>> As opposed to the string `"hello world"`, the book likely contains a lot of +>> different words, potentially different capitalisation and spellings, +>> additional punctuation and maybe special characters that your function may or +>> may not handle correctly. You might need a lot of different test strings to +>> cover all these cases (and combinations thereof). +>> +>> The alternative approach with tiny test strings cures all of the +>> disadvantages listed above, and the tests will be easy to read, understand and +>> verify, particularly if you use expressive test function names and parameter +>> `ids`. This is the best way to write a unit test, i.e. a test that is +>> concerned with a single unit of functionality in isolation and will likely +>> be run hundreds of times during a coding session. +>> +>> Nevertheless, in a bigger project you would want to have other kinds of +>> tests, too. The `word_counts` functionality will probably be integrated into +>> a larger aspect of functionality, e.g., a statistical analysis of books. In +>> such a case, it is equally important to test that the integration of the +>> various individually tested units works correctly.
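To make the tiny-string approach concrete, here is a minimal sketch of such unit tests. It assumes that `word_counts(text)` returns a dictionary-like mapping of words to counts and that it can be imported as below (the module name `books` is a placeholder); the expected values would need adjusting to however the real function handles punctuation and capitalisation:

~~~
import pytest

from books import word_counts


@pytest.mark.parametrize("text, expect", [
    ("", {}),
    ("hello world", {"hello": 1, "world": 1}),
    ("hello hello world", {"hello": 2, "world": 1}),
], ids=["empty string", "two distinct words", "repeated word"])
def test_word_counts_tiny_strings(text, expect):
    # Each case is small enough to verify by eye, runs instantly,
    # and needs no network access.
    assert word_counts(text) == expect
~~~
{: .language-python}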
Such integration tests (as opposed to the unit tests sketched above) +>> will be run less often and can be more meaningful because they exercise more +>> realistic circumstances. For such tests -- and definitely for the even broader +>> end-to-end tests that run a whole program from the (simulated) user input to +>> a final output -- the original approach is well-suited. > {: .solution} {: .challenge} diff --git a/_episodes/04-edges.md b/_episodes/04-edges.md index 7b4228c..f19270b 100644 --- a/_episodes/04-edges.md +++ b/_episodes/04-edges.md @@ -114,7 +114,7 @@ def test_left_edge(): assert c.neighbours() == 3 # Check the coordinates of the neighbours. - assert c.left() == None + assert c.left() is None assert c.right() == (1, 2) assert c.up() == (0, 3) assert c.down() == (0, 1) @@ -146,10 +146,10 @@ def test_bottom_left_corner(): assert c.neighbours() == 2 # Check the coordinates of the neighbours. - assert c.left() == None + assert c.left() is None assert c.right() == (1, 0) assert c.up() == (0, 1) - assert c.down() == None + assert c.down() is None ~~~ {: .language-python} diff --git a/_episodes/05-randomness.md b/_episodes/05-randomness.md index 4e40863..a3eab52 100644 --- a/_episodes/05-randomness.md +++ b/_episodes/05-randomness.md @@ -173,11 +173,14 @@ that it is relatively bug-free for the cases we've tested for. Of course, so far we've only tested 6-sided dice—we have no guarantee that it works for other numbers of sides, yet. -You can extend this approach to any programming problem where you don't know the -exact answer up front, including those that are random and those that are just -exploratory. Start by focusing on what you do know, and write tests for that. As -you understand more what the expected results are, you can expand the test -suite. +The important upshot of this approach is that, even though we could not +predict the exact return value of our function, we were still able to test +invariants and guarantees that it is known to uphold. You can extend this +approach to any programming problem where the exact return value of a function +cannot be meaningfully tested for, including those that are random or out of +your control and those that are just exploratory. Start by focusing on what you +do know, and write tests for that. As you understand more what the expected +results are, you can expand the test suite. > ## Two six-sided dice > diff --git a/_episodes/06-continuous-integration.md b/_episodes/06-continuous-integration.md index 67c7011..03e3033 100644 --- a/_episodes/06-continuous-integration.md +++ b/_episodes/06-continuous-integration.md @@ -338,6 +338,23 @@ next to the first commit, a green tick (passed) next to the second, and nothing > check all code against a defined house style (for example, PEP 8). {: .callout} +> ## pre-commit +> +> Another helpful developer tool somewhat related to CI is +> [pre-commit][pre-commit] (or, more generally, `git` hooks). Hooks let you +> perform certain actions locally, triggered by various `git`-related events +> such as before or after a commit, merge, or push. A standard use case is +> running automated formatters or code linters before every commit or push, but +> other things are possible too, like updating a version number. One major +> difference with respect to CI is that each developer on your team has to +> manually install the hooks themselves and could therefore choose not to do so.
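As a minimal sketch of the underlying mechanism (a plain `git` hook rather than the `pre-commit` tool itself; the choice of `black --check` is just an illustration), an executable file at `.git/hooks/pre-commit` could be a small Python script whose non-zero exit status makes `git` abort the commit:

~~~
#!/usr/bin/env python3
"""A minimal git pre-commit hook: refuse to commit unformatted code."""
import subprocess
import sys

# Run the formatter in check-only mode; it exits with a non-zero status
# if any file would need reformatting.
result = subprocess.run(["black", "--check", "."])

# Propagate that status: anything non-zero makes git abort the commit.
sys.exit(result.returncode)
~~~
{: .language-python}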
Because hooks live only in each developer's local clone (as +> opposed to CI running in a central repository), they cannot enforce anything; +> they are purely a convenience for the programmer, while CI can be used to +> reject pushes or pull requests automatically. Furthermore, +> you are supposed to commit often and, hence, committing should be a fast and +> lightweight action. Therefore, the pre-commit developers explicitly discourage +> running expensive test suites as a pre-commit hook. +> {: .callout} > ## Try it yourself > @@ -366,3 +383,4 @@ next to the first commit, a green tick (passed) next to the second, and nothing [pypi]: https://pypi.org [starter-workflows]: https://github.com/actions/starter-workflows [yaml]: https://en.wikipedia.org/wiki/YAML +[pre-commit]: https://pre-commit.com diff --git a/_episodes/07-coverage.md b/_episodes/07-coverage.md index 57be482..30f6d6d 100644 --- a/_episodes/07-coverage.md +++ b/_episodes/07-coverage.md @@ -98,6 +98,8 @@ the consistency checks in the `__init__()` method of `Cell`, and methods such as methods) to have at least one test, so this test suite would benefit from being expanded. +> ## How much coverage do I need? +> > It's worth pointing out again that 100% coverage is not essential for a good > test suite. If the coverage is below 100%, then that indicates that it's worth > understanding where the uncovered lines of code are, and whether it is worth > @@ -116,6 +118,22 @@ expanded. > between projects. {: .callout} +> ## Configuring `coverage` +> +> `coverage` and `pytest-cov` are configurable via a configuration file, by +> default an INI-style file called `.coveragerc` (the same settings can also be +> placed in `setup.cfg` or `pyproject.toml`). Various details about behaviour +> and output can be adjusted there. Most notably, explicit exceptions can be +> defined that exclude certain files, blocks or lines from the coverage report. +> +> This is useful in various situations; you can, e.g., exclude the test files +> from the coverage report to reduce noise, or change the command-line output. +> +> Another, more opinionated, idea is to indeed aim for 100% code coverage but to +> explicitly exclude whatever you consider unimportant to test. While +> opponents say that this is just cheating, you have at least made a conscious +> decision to exclude a piece of code and explicitly documented it in a file +> (ideally with a comment explaining the decision). +{: .callout} ## Coverage and continuous integration diff --git a/_episodes/08-exercise.md b/_episodes/08-exercise.md index 691242e..5654f70 100644 --- a/_episodes/08-exercise.md +++ b/_episodes/08-exercise.md @@ -1,12 +1,14 @@ --- title: "Putting it all together" -teaching: 5 +teaching: 10 exercises: 90 questions: - "How can I apply all of these techniques at once to a real application?" objectives: - "Be able to apply testing and CI techniques to a piece of research software." keypoints: +- "Tests can have very different purposes and you should keep in mind the broad + applicability of automated testing." - "Testing and CI work well together to identify problems in research software and allow them to be fixed quickly." - "If anything is unclear, or you get stuck, please ask for help!" --- @@ -15,6 +17,87 @@ Now we have developed a range of skills relating to testing and continuous integration, we can try putting them into practice on a real piece of research software. +But before we do so, let us quickly recap what we learnt today. + +## The purpose of a test + +When you write a test, you do this to gain (and, in the case of automated tests, +maintain) confidence in the correctness of your code.
In detail, however, your tests +can serve a variety of purposes. It can be useful to keep in mind what you could +use tests for during your coding, so we have compiled a (certainly non-exhaustive) list +of test purposes here. The major ones were discussed in the lesson; some more +exotic ones should be seen as suggestions for you to try. A test can have more +than one of these purposes: + +* **test for features** - The first and simplest test that is capable of verifying + the correctness of any feature (in the broadest sense) of your code. This is + the obvious purpose of a test and encountered everywhere in the lesson. +* **test for confidence (in a narrow sense)** - Additional tests of the same + features that are redundant in that they repeat a test for features with + qualitatively similar input just to double-check. This is also encountered in + the lesson: e.g., in the solution of "More parameters" in [pytest + features][pytest_features], large numbers aren't really that different from + other numbers unless you run into overflow errors (one could even argue that + testing the negative numbers in [pytest features][pytest_features] is not + qualitatively different either and is rather there to double-check). One should be aware of + the difference between testing for confidence and necessary feature tests. The + former is a convenience that comes at the cost of longer test runs, so it + is not always desirable to test redundantly (although doing so is certainly better than + missing an aspect). +* **test for edge-/corner-cases** - Test special input or conditions for which a + general algorithm needs to be specialised (e.g., NaNs, infinities, overflows, + empty input, etc.). We did a [whole episode][edge_cases] on this. +* **test for failures** - This is part of feature testing but important enough to + mention explicitly: The conditions under which your code fails are part of + your interface and need to be tested. The user (that probably includes + yourself) might rely on a raised exception or returned default value in the + case of failure. Make sure they can rely on it, and think of all the cases that your + current approach cannot handle. Any changes in these (even those for the + better) are changes of the interface and should only happen intentionally. This was + discussed in [pytest features][pytest_features]. +* **fuzz testing** - This is broader than testing for failures; if you have + inexperienced or even malicious users of your project, they might run your + code with inputs or under conditions that do not make any sense at all and are + almost impossible to predict. Fuzz testing (or fuzzing) is a strategy where you let the + computer run your code with random input (sometimes down to the bit level) and + make sure that not even the most far-fetched input can break your code. There + are libraries for that, so you don't have to set up all the boilerplate + yourself. +* **regression test** - After you have found a bug, you can write a test reproducing + the precise conditions under which the bug appeared. Once you have fixed it, the + test will pass, and if a later change risks introducing this bug again, you + can rest assured that it will be immediately signalled by a failing test. +* **test as a reminder** - In most contemporary test frameworks, you can mark a test + as an "expected failure". Such tests are run during your standard test runs + but the test framework will complain if they don't fail. This can be a + convenient way of marking a to-do or a known bug that you don't have time to + fix at the moment.
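For instance, `pytest` implements this with the `xfail` marker. The sketch below builds on the `add_arrays` function from earlier in the lesson, but the missing feature and the reason string are purely hypothetical:

~~~
import pytest

from arrays import add_arrays


# A "reminder" test: we would like add_arrays to support adding a plain
# number to every element one day, but it does not do so yet.  pytest
# reports this test as "xfail" rather than as a failure, and flags it as
# "XPASS" if it unexpectedly starts passing.
@pytest.mark.xfail(reason="scalar addition is not implemented yet")
def test_add_scalar_to_array():
    assert add_arrays([1, 2, 3], 1) == [2, 3, 4]
~~~
{: .language-python}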
Such a reminder test will preserve your precise intention, e.g., the exact + conditions of the bug in code form, and it can be an important piece of information if + a bug disappears unexpectedly. Maybe another code change had an effect you + did not intend? +* **test for fixing an external interface** - You can even test code you did not write + yourself. If you rely on a particular library, you don't have control over the + evolution of that library, so it can be a good idea to write a few test cases + that just use the interface of that library in the same way that your code does. If the + developers ever change or deprecate something about that interface, you don't have to + chase down a rabbit hole of function calls to get to the bottom of it but + instead have a (hopefully well-named) test that immediately signals where the + problem lies. +* **test for learning an external interface** - When you start using a new library, + you might play around with it for a while before using it in production just + to learn how it's used. Why not preserve this in automated tests? You get the + same effect as if you, e.g., wrote a script or used an interactive session, but + you can come back and have a look at it again later. Also, you immediately fix + the external interface (see the previous item). + +This list is certainly not something you want to implement as a whole. Some of +the purposes might simply not apply (e.g. fuzz testing if you don't have +external users) or might not be worth the extra effort (e.g. fixing an external +interface that is expected to be very stable). But you might find yourself in a +situation where some of these are appropriate tools for your problem, and you +might want to come back from time to time and refresh your memory. That said, +let's dive into the final exercise. + ## The software We are going to work with `pl_curves`, a piece of research software developed by @@ -34,10 +117,16 @@ the different bacteria are. It already has tests written for most functions. > the badges. > 4. Create a virtual environment on your computer for the project, and install > the project's requirements, so you can run the test suite locally. -> 5. Currently, some of the tests for the repository fail. Work out why this is +> 5. The current code is very outdated by now and you will see in a moment that +> it does not work with a standard contemporary Python installation anymore. +> Assuming for a moment that the tests did not exist, how would you feel about +> the task of updating the code to run _correctly_ on a modern machine? Where +> would you start? How confident would you feel that each and every line of +> code works as intended? +> 6. Now we turn to the tests. Some of them currently fail. Work out why this is > happening, and fix the issues. Check that they are fixed in the CI workflow > as well. -> 6. Currently, the code is only tested for Python versions up to 3.6. Since +> 7. Currently, the code is only tested for Python versions up to 3.6. Since > Python has moved on now, add 3.7, 3.8 and 3.9 as targets for the CI. Do the > tests pass now? If not, identify what has caused them to fail, and fix the > issues you identify. This is an important reason for having a test suite: > Without a test suite, you don't know whether this has happened until > someone points out that your new results don't match your older ones! > Having CI set up allows easy testing of multiple different versions. -> 7.
Currently the code is being tested against Ubuntu 18.04 (released April 2018). +> 8. Currently the code is being tested against Ubuntu 18.04 (released April 2018). > A new long term support release of Ubuntu came out in April 2020 (version 20.04). > Upgrade the operating system being tested from Ubuntu 18.04 to Ubuntu 20.04. > As with upgrading Python, the test suite helps us check that the code still > runs on a newer operating system. -> 8. Upgrade to the most recent version of Pandas. Again, see if this breaks +> 9. Upgrade to the most recent version of Pandas. Again, see if this breaks > anything. If it does, then fix the issues, and ensure that the test suite > passes again. > @@ -64,3 +153,5 @@ the different bacteria are. It already has tests written for most functions. [pl-curves]: https://github.com/CDT-AIMLAC/pl_curves +[pytest_features]: https://edbennett.github.io/python-testing-ci/02-pytest-functionality/index.html +[edge_cases]: https://edbennett.github.io/python-testing-ci/04-edges/index.html diff --git a/files/code-testing.zip b/files/code-testing.zip index f946700..cf5e745 100644 Binary files a/files/code-testing.zip and b/files/code-testing.zip differ