operator: q-learn

    description:
        q-learn[iterations, alpha, gamma, op] sp
        apply Q-learning to the state transitions defined by op,
        where sp is the set of final/goal (terminal) states
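
        The parameters correspond to the standard tabular Q-learning update:
        iterations is the number of training episodes, alpha the learning rate,
        and gamma the discount factor, with the update rule
        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a)).
        As a rough sketch only (plain Python, not the q-learn internals; the
        state graph is a dict mapping each state to its allowed next states,
        and "actions" are simply those next states):

            import random

            def q_learn(step, reward, goals, iterations=1000, alpha=1.0, gamma=0.8):
                # Q is keyed by (state, next-state) pairs, initialised to zero:
                Q = {(s, a): 0.0 for s in step for a in step[s]}
                for _ in range(iterations):
                    s = random.choice(list(step))        # random start state
                    while s not in goals:                # an episode ends at a goal state
                        a = random.choice(step[s])       # explore: pick a random next state
                        target = reward.get(a, 0) + gamma * max(Q[(a, b)] for b in step[a])
                        Q[(s, a)] += alpha * (target - Q[(s, a)])
                        s = a
                return Q

            # e.g. for the first example below: q_learn({0: [4], ...}, {5: 100}, {5})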

    examples:
        -- load q-learning-example.sw
        -- data from here: http://mnemstudio.org/path-finding-q-learning-tutorial.htm
        step |0> => |4>
        step |1> => |3> + |5>
        step |2> => |3>
        step |3> => |1> + |2> + |4>
        step |4> => |0> + |3> + |5>
        step |5> => |1> + |4> + |5>
        
        reward |0> => |0>
        reward |1> => |0>
        reward |2> => |0>
        reward |3> => |0>
        reward |4> => |0>
        reward |5> => |100>

        -- learn the Q and norm-Q values:
        q-learn[1000, 1, 0.8, step] |5>
        
        -- now display the results in a table:        
        table[transition, norm-Q] ket-sort rel-kets[norm-Q]

            +------------+--------+
            | transition | norm-Q |
            +------------+--------+
            | 0 -> 4     | 80     |
            | 1 -> 3     | 64.0   |
            | 1 -> 5     | 100    |
            | 2 -> 3     | 64.0   |
            | 3 -> 1     | 80     |
            | 3 -> 2     | 51.2   |
            | 3 -> 4     | 80     |
            | 4 -> 0     | 64.0   |
            | 4 -> 3     | 64.0   |
            | 4 -> 5     | 100    |
            | 5 -> 1     | 80     |
            | 5 -> 4     | 80     |
            | 5 -> 5     | 100.0  |
            +------------+--------+
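
        -- a quick cross-check of the table above in plain Python, assuming
        -- norm-Q is just Q rescaled so the largest value is 100. It uses plain
        -- value iteration on Q(s, a) = reward(a) + gamma * max_b Q(a, b), the
        -- fixed point that 1000 episodes with alpha = 1 converge towards.
        -- A sanity check on the numbers only, not the q-learn implementation:

            gamma = 0.8
            step = {0: [4], 1: [3, 5], 2: [3], 3: [1, 2, 4], 4: [0, 3, 5], 5: [1, 4, 5]}
            reward = {5: 100}                      # reward for entering a state; others 0

            Q = {(s, a): 0.0 for s in step for a in step[s]}
            for _ in range(1000):                  # sweep well past convergence
                for s, a in Q:
                    Q[(s, a)] = reward.get(a, 0) + gamma * max(Q[(a, b)] for b in step[a])

            top = max(Q.values())
            for s, a in sorted(Q):                 # prints 0 -> 4  80, 1 -> 3  64, ...
                print(f"{s} -> {a}    {100 * Q[(s, a)] / top:g}")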

        -- next example, data from here (see the cheese example): 
        -- https://medium.freecodecamp.org/diving-deeper-into-reinforcement-learning-with-q-learning-c18d0db58efe
        -- load q-learning-example-2.sw
        
        step |A> => |B> + |D>
        step |B> => |A> + |E> + |C>
        step |C> => |B> + |F>
        step |D> => |A> + |E>
        step |E> => |E>
        step |F> => |E> + |C> + |F>
        
        reward |A> => |0>
        reward |B> => |1>
        reward |C> => |0>
        reward |D> => |2>
        reward |E> => |-10>
        reward |F> => |10>

        -- q-learn[iterations, alpha, gamma, op] set-of-terminal-states:
        |null> => q-learn[1000, 1, 0.8, step] (|E> + |F>)
        
        -- now display the transition table:       
        table[transition, norm-Q] ket-sort rel-kets[norm-Q]

            +------------+--------+
            | transition | norm-Q |
            +------------+--------+
            | A -> B     | 66.0   |
            | A -> D     | 46.24  |
            | B -> A     | 52.8   |
            | B -> C     | 80     |
            | B -> E     | -20.0  |
            | C -> B     | 66.0   |
            | C -> F     | 100    |
            | D -> A     | 52.8   |
            | D -> E     | -20.0  |
            | E -> E     | -20.0  |
            | F -> C     | 80.0   |
            | F -> E     | -20.0  |
            | F -> F     | 100.0  |
            +------------+--------+

        -- show the walk sequences:
        walk |*> #=> q-walk |_self>
        table[start, walk] rel-kets[step]

            +-------+-------------------+
            | start | walk              |
            +-------+-------------------+
            | A     | A . B . C . F     |
            | B     | B . C . F         |
            | C     | C . F             |
            | D     | D . A . B . C . F |
            | E     | E                 |
            | F     | F                 |
            +-------+-------------------+
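
        -- q-walk presumably follows the highest norm-Q transition out of each
        -- state until it reaches a terminal state (here E or F). A hedged
        -- sketch of that greedy walk in plain Python, using the norm-Q values
        -- from the transition table above:

            norm_Q = {
                'A': {'B': 66.0, 'D': 46.24},
                'B': {'A': 52.8, 'C': 80, 'E': -20.0},
                'C': {'B': 66.0, 'F': 100},
                'D': {'A': 52.8, 'E': -20.0},
                'E': {'E': -20.0},
                'F': {'C': 80.0, 'E': -20.0, 'F': 100.0},
            }
            terminal = {'E', 'F'}

            def q_walk(state):
                path = [state]
                while state not in terminal:
                    state = max(norm_Q[state], key=norm_Q[state].get)   # greedy step
                    path.append(state)
                return " . ".join(path)

            for s in "ABCDEF":
                print(s, ":", q_walk(s))   # A : A . B . C . F, etc, matching the walk column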

    see also:
        q-walk, q-learning-example.sw, q-learning-example-2.sw
