`
dcaoyuan
  • 浏览: 306420 次
社区版块
存档分类
最新评论

A Simple XML State Machine Accepting SAX Events to Build an xmerl-Compatible XML Tree: icalendar demo

阅读更多

xmerl provides full XML functionality in Erlang, with a lot of features like XPATH, XSLT, event_function, acc_function etc. Well, for now I just want to parse icalendar into the form of an xmerl tree, which will contain #xmlElement, #xmlAttribute, #xmlText etc., so that XPATH can easily be applied to it.

How about an approach in which the parser just generates SAX events, and a callback state machine attached to it builds a JSON or XML tree, or anything else?

I hoped xmerl was something like this, i.e. a parser to generate SAX events, and a state machine to accept the events and build the XML tree. I dug into xmerl's code, but, unfortunately, the parser and state machine are coupled together.

So I wrote a simple state machine which just receives SAX events to build an xmerl-compatible XML tree. And I applied it to icalendar.

I like this idea: by using SAX events as the common interface, I only need to write another JSON state machine later, and then the result will be a JSON form of the icalendar. I can share the same parser, which just generates SAX events.

Here's the code, which is not complete yet; it is just meant to show how much a SAX interface can serve.

%%% A state machine which receives SAX events and builds an xmerl-compatible tree


-module(xml_sm).

-include_lib("xmerl/include/xmerl.hrl").

-export([state/2]).

-export([test/0
        ]).

%% One frame of the state stack used by state/2. A frame with all defaults is
%% pushed for the document itself; one more frame is pushed per open element.
-record(xmlsmState, {
    %% name of the element this frame belongs to (undefined for the document frame)
    qname = undefined,
    %% #xmlAttribute{} records of the element, accumulated in reverse order
    attributes = [],
    %% child nodes collected so far (#xmlElement{} / #xmlText{}), in reverse order
    content = [],
    %% [{Name, Pos}] path from this element up to the outermost one, innermost first
    parents = []
}).

%% Streams a list of SAX events through state/2, starting with no state.
%% Returns {ok, TopElement, RemainingEvents} as soon as the machine reports a
%% finished top element, {error, Reason} if the machine reports an error, or
%% {ok, [], []} when the events run out before either of those happens.
receive_events(Events) ->
    receive_events(Events, undefined).

receive_events([], _Stack) ->
    {ok, [], []};
receive_events([Event | Pending], Stack) ->
    continue_events(state(Event, Stack), Pending).

%% Decides what to do with the result of a single state/2 step.
continue_events({ok, TopElement}, Pending) ->
    {ok, TopElement, Pending};
continue_events({error, Reason}, _Pending) ->
    {error, Reason};
continue_events(Stack, Pending) ->
    %% anything else is the updated state stack: keep feeding events
    receive_events(Pending, Stack).

%% Advances the tree builder by one SAX event.
%%
%% StateStack is a list of #xmlsmState{} frames, innermost open element first;
%% the last frame is the document frame pushed by {startDocument}.
%%
%% Events:
%%   {startDocument}                           - returns a fresh stack
%%   {startElement, Uri, LocalName, QName, Attrs}
%%                                             - pushes a frame; Attrs is a
%%                                               list of {Key, Value} pairs
%%   {characters, Characters}                  - adds an #xmlText{} to the
%%                                               innermost open element
%%   {endElement, Uri, LocalName, QName}       - pops a frame and builds the
%%                                               #xmlElement{}; QName may be
%%                                               `undefined' to skip the name
%%                                               check
%%   {endDocument}                             - expects the {ok, TopElement}
%%                                               produced by the last endElement
%%
%% Returns the updated stack, or {ok, TopElement} once the outermost element
%% has been closed, or {error, {bad_element_match, StateStack}} when
%% {endDocument} arrives while elements are still open.
%% Throws a flat string when an endElement name differs from the open element.
state({startDocument}, _StateStack) ->
    [#xmlsmState{}];
state({endDocument}, {ok, TopElement}) ->
    {ok, TopElement};
state({endDocument}, StateStack) ->
    %% io:fwrite/3 returns `ok', so its result must not be used as the error
    %% reason; print the diagnostic and return a machine-matchable reason.
    io:fwrite(user, "Bad element match, StateStack is: ~n~p~n", [StateStack]),
    {error, {bad_element_match, StateStack}};
state({startElement, _Uri, _LocalName, QName, Attrs}, StateStack) ->
    %% peek at the current frame to inherit its ancestry path
    [#xmlsmState{parents = Parents} | _] = StateStack,
    {_Count, Attributes} = lists:foldl(
        fun ({Key, Value}, {Pos, AccAttrs}) ->
                Pos1 = Pos + 1,
                Attr = #xmlAttribute{name = Key,
                                     value = Value,
                                     parents = [{QName, Pos1} | Parents]},
                {Pos1, [Attr | AccAttrs]}
        end, {0, []}, Attrs),
    %% push a frame for the new element; attributes stay reversed until the
    %% element is closed
    NewState = #xmlsmState{qname = QName,
                           attributes = Attributes,
                           content = [],
                           parents = [{QName, 0} | Parents]},
    [NewState | StateStack];
state({endElement, _Uri, _LocalName, QName}, StateStack) ->
    %% pop the frame of the element being closed
    [State | StatesPrev] = StateStack,
    #xmlsmState{qname = ElemName,
                attributes = Attributes,
                content = Content,
                parents = Parents} = State,
    %% an `undefined' QName means "don't care": fall back to the name recorded
    %% at startElement instead of producing an element called `undefined'
    Name = closing_name(QName, ElemName),
    %% the element's own entry is dropped from its ancestry path
    [_ | ParentsPrev] = Parents,
    Element = #xmlElement{name = Name,
                          attributes = lists:reverse(Attributes),
                          content = lists:reverse(Content),
                          parents = ParentsPrev},
    case StatesPrev of
        [_DocumentState] ->
            %% only the document frame is left: this was the top element
            {ok, Element};
        [ParentState | Other] ->
            #xmlsmState{content = ParentContent} = ParentState,
            %% hand the finished element to its parent and continue there
            [ParentState#xmlsmState{content = [Element | ParentContent]} | Other]
    end;
state({characters, Characters}, StateStack) ->
    [State | StatesPrev] = StateStack,
    #xmlsmState{content = Content, parents = Parents} = State,
    %% bump the position counter stored with the innermost element
    [{Parent, Pos} | ParentsPrev] = Parents,
    Parents1 = [{Parent, Pos + 1} | ParentsPrev],
    Text = #xmlText{value = Characters,
                    parents = Parents1},
    UpdatedState = State#xmlsmState{content = [Text | Content],
                                    parents = Parents1},
    [UpdatedState | StatesPrev].

%% Chooses the name of the element being closed: the recorded name when the
%% event carries `undefined', otherwise the event's name after checking that
%% it equals the recorded one.
closing_name(undefined, ElemName) ->
    ElemName;
closing_name(QName, ElemName) when QName == ElemName ->
    QName;
closing_name(QName, ElemName) ->
    throw(lists:flatten(io_lib:format(
         "Element name match error: ~p should be ~p~n",
         [QName, ElemName]))).

%% Smoke test: builds the same small feed document twice, once through
%% receive_events/1 and once by folding state/2 over the events by hand,
%% and prints both results to the `user' device.
test() ->
    FeedAttrs = [{link, "http://lightpole.net"}, {author, "Caoyuan"}],
    FirstEntry = [
        {startElement, "", entry, entry, [{tag, "Erlang, Function"}]},
        {characters, "Entry1's text"},
        {endElement, "", entry, entry}
    ],
    SecondEntry = [
        {startElement, "", entry, entry, []},
        {characters, "Entry2's text"},
        {endElement, "", entry, entry}
    ],
    Events =
        [{startDocument},
         {startElement, "", feed, feed, FeedAttrs},
         {characters, "feed text"}]
        ++ FirstEntry
        ++ SecondEntry
        ++ [{endElement, "", feed, feed},
            {endDocument}],

    %% Streaming: let receive_events/1 drive the state machine
    {ok, StreamedTree, _Unconsumed} = receive_events(Events),
    io:fwrite(user, "Streaming Result: ~n~p~n", [StreamedTree]),

    %% Stepped: feed the events one at a time through the exported callback
    {ok, SteppedTree} = lists:foldl(fun xml_sm:state/2, undefined, Events),
    Exported = xmerl:export_simple([SteppedTree], xmerl_xml),
    io:fwrite(user, "Stepped Result: ~n~p~n", [lists:flatten(Exported)]).

And the primary icalendar front end:

-module(ical_parser).

-include_lib("xmerl/include/xmerl.hrl").

-export([parse/1
        ]).

-export([test/0
        ]).

%% Event sink of the parser: every SAX event is applied to this fun/2 together
%% with the current state, and its return value becomes the next state.
-define(stateMachine, fun xml_sm:state/2).

%% Parses icalendar text (a string) by emitting SAX events into ?stateMachine.
%% Returns whatever the state machine answers to the final {endDocument}.
parse(Text) ->
    Initial = ?stateMachine({startDocument}, undefined),
    Body = skip_ws(Text),
    Final = parse_line(Body, 0, Initial),
    ?stateMachine({endDocument}, Final).

%% Parses content lines one after another and emits the matching SAX events.
%%
%%   Text   - remaining input (string)
%%   Line   - running line counter
%%   States - current state of ?stateMachine
%%
%% "BEGIN:<name>" emits startElement, "END:<name>" emits endElement, and any
%% other line is treated as a property: startElement (with its parameters as
%% attributes), characters (its value) and endElement.
%% Returns the final state-machine state when the input is exhausted, or the
%% atom `error' when BEGIN/END is followed by something that is neither ":"
%% nor the continuation of a longer property name.
parse_line([], _Line, States) -> States;
parse_line([$\s | T], Line, States) -> parse_line(T, Line, States);
parse_line([$\t | T], Line, States) -> parse_line(T, Line, States);
parse_line([$\r | T], Line, States) -> parse_line(T, Line, States);
parse_line([$\n | T], Line, States) -> parse_line(T, Line + 1, States);
parse_line("BEGIN" ++ T = Text, Line, States) ->
    case skip_ws(T) of
        [$: | T1] ->
            {Rest, Line1, Name} = parse_component_name(skip_ws(T1), Line, States, []),
            States1 = ?stateMachine({startElement, "", Name, Name, []}, States),
            parse_line(skip_ws(Rest), Line1, States1);
        _ ->
            parse_longer_name(T, Text, Line, States)
    end;
parse_line("END" ++ T = Text, Line, States) ->
    case skip_ws(T) of
        [$: | T1] ->
            {Rest, Line1, Name} = parse_component_name(skip_ws(T1), Line, States, []),
            States1 = ?stateMachine({endElement, "", Name, Name}, States),
            parse_line(skip_ws(Rest), Line1, States1);
        _ ->
            parse_longer_name(T, Text, Line, States)
    end;
parse_line(Text, Line, States) ->
    parse_property_line(Text, Line, States).

%% Handles a line that starts with "BEGIN"/"END" without being a component
%% delimiter. When the keyword is directly followed by another name character
%% (letter, digit or "-") the line is a property whose name merely starts with
%% the keyword, so it is parsed as a property; anything else is still `error'.
parse_longer_name([C | _], Text, Line, States)
  when C >= $A, C =< $Z; C >= $a, C =< $z; C >= $0, C =< $9; C =:= $- ->
    parse_property_line(Text, Line, States);
parse_longer_name(_Other, _Text, _Line, _States) ->
    error.

%% Parses one property line and emits it as an element wrapping its value.
parse_property_line(Text, Line, States) ->
    {Rest, Line1, {Name, Params}, Value} = parse_prop(skip_ws(Text), Line, States, {[], []}),
    States1 = ?stateMachine({startElement, "", Name, Name, Params}, States),
    States2 = ?stateMachine({characters, Value}, States1),
    States3 = ?stateMachine({endElement, "", Name, Name}, States2),
    parse_line(skip_ws(Rest), Line1, States3).

%% Reads the component name that follows "BEGIN:" / "END:".
%%
%% Name is the accumulator holding the characters read so far in reverse order.
%% Carriage returns are dropped, blanks after each name character are skipped,
%% and a line break followed by a space or tab is treated as a folded line.
%% Returns {Rest, Line1, NameAtom} where NameAtom is the lower-cased name.
%% Reaching the end of the input also ends the name, so a calendar whose last
%% line has no trailing newline is accepted instead of crashing.
parse_component_name([], Line, _States, Name) ->
    {[], Line, component_atom(Name)};
parse_component_name([$\r | T], Line, States, Name) ->
    parse_component_name(T, Line, States, Name);
parse_component_name([$\n | T], Line, States, Name) ->
    case unfolding_line(T) of
        {true,  Rest} -> parse_component_name(Rest, Line, States, Name);
        {false, Rest} -> {Rest, Line + 1, component_atom(Name)}
    end;
parse_component_name([H | T], Line, States, Name) ->
    parse_component_name(skip_ws(T), Line, States, [H | Name]).

%% Turns the reversed, accumulated name into a lower-case atom.
%% NOTE(review): list_to_atom/1 creates atoms from parsed input and atoms are
%% never garbage collected; confirm the input is trusted or switch to a fixed
%% set of known component names.
component_atom(ReversedName) ->
    list_to_atom(string:to_lower(lists:reverse(ReversedName))).
    
%% Reads a property up to the end of its value.
%%
%% The last argument is {NameAcc, ParamsAcc}: the property name read so far
%% (reversed) and the {ParamName, ParamValue} pairs read so far (reversed).
%% ";" starts a parameter, ":" starts the value; any other character belongs
%% to the name, and blanks following it are skipped.
%% Returns {Rest, Line1, {NameAtom, Params}, Value}.
parse_prop([$; | Tail], Line, States, {NameAcc, ParamsAcc}) ->
    {AfterParam, NextLine, Key, Val} = parse_param(Tail, Line, States, []),
    parse_prop(AfterParam, NextLine, States, {NameAcc, [{Key, Val} | ParamsAcc]});
parse_prop([$: | Tail], Line, States, {NameAcc, ParamsAcc}) ->
    NameAtom = list_to_atom(string:to_lower(lists:reverse(NameAcc))),
    Params = lists:reverse(ParamsAcc),
    {AfterValue, NextLine, Value} = parse_prop_value(Tail, Line, States, []),
    {AfterValue, NextLine, {NameAtom, Params}, Value};
parse_prop([Char | Tail], Line, States, {NameAcc, ParamsAcc}) ->
    parse_prop(skip_ws(Tail), Line, States, {[Char | NameAcc], ParamsAcc}).

%% Reads a property value up to the end of its (possibly folded) line.
%%
%% Value is the accumulator holding the characters read so far in reverse
%% order. Carriage returns are dropped; a line break followed by a space or
%% tab continues the value on the next line.
%% Returns {Rest, Line1, ValueString}.
%% Reaching the end of the input also ends the value, so a last line without
%% a trailing newline is accepted instead of crashing.
parse_prop_value([], Line, _States, Value) ->
    {[], Line, lists:reverse(Value)};
parse_prop_value([$\r | T], Line, States, Value) ->
    parse_prop_value(T, Line, States, Value);
parse_prop_value([$\n | T], Line, States, Value) ->
    case unfolding_line(T) of
        {true,  Rest} -> parse_prop_value(Rest, Line, States, Value);
        {false, Rest} -> {Rest, Line + 1, lists:reverse(Value)}
    end;
parse_prop_value([H | T], Line, States, Value) ->
    parse_prop_value(T, Line, States, [H | Value]).

%% Reads one "name=value" parameter of a property.
%%
%% Name is the accumulator with the parameter name read so far, reversed;
%% blanks following a name character are skipped.
%% Returns {Rest, Line1, ParamNameAtom, ParamValue}.
parse_param([$= | AfterEq], Line, States, NameAcc) ->
    Key = list_to_atom(string:to_lower(lists:reverse(NameAcc))),
    {Remaining, NextLine, Val} = parse_param_value(AfterEq, Line, States, []),
    {Remaining, NextLine, Key, Val};
parse_param([Char | Tail], Line, States, NameAcc) ->
    parse_param(skip_ws(Tail), Line, States, [Char | NameAcc]).

%% Reads a parameter value.
%%
%% Value is the accumulator with the characters read so far, reversed.
%% The value ends at ";" (consumed: another parameter follows) or at ":"
%% (left in Rest: it marks the start of the property value).
%% Inside a double-quoted section ":" and ";" are ordinary characters, as
%% RFC 5545 allows them in quoted parameter values; the quotes themselves are
%% kept in the returned value.
%% Returns {Rest, Line, ValueString}.
parse_param_value([$" | T], Line, States, Value) ->
    parse_quoted_param_value(T, Line, States, [$" | Value]);
parse_param_value([$; | T], Line, _States, Value) ->
    {T, Line, lists:reverse(Value)};
parse_param_value([$: | T], Line, _States, Value) ->
    %% keep $: for end of prop name
    {[$: | T], Line, lists:reverse(Value)};
parse_param_value([H | T], Line, States, Value) ->
    parse_param_value(T, Line, States, [H | Value]).

%% Copies characters verbatim until the closing double quote, then resumes
%% normal parameter-value parsing.
parse_quoted_param_value([$" | T], Line, States, Value) ->
    parse_param_value(T, Line, States, [$" | Value]);
parse_quoted_param_value([H | T], Line, States, Value) ->
    parse_quoted_param_value(T, Line, States, [H | Value]).


%% Checks whether the text right after a line break continues the previous
%% line: a leading space or horizontal tab marks a folded line.
%% Returns {true, TextAfterTheBlank} or {false, TextUnchanged}.
unfolding_line([Blank | Continued]) when Blank =:= $\s; Blank =:= $\t ->
    {true, Continued};
unfolding_line(NextLine) ->
    {false, NextLine}.
    
%% Drops leading spaces and horizontal tabs; line breaks are not skipped.
skip_ws([Blank | Remaining]) when Blank =:= $\s; Blank =:= $\t ->
    skip_ws(Remaining);
skip_ws(Trimmed) ->
    Trimmed.


%% Smoke test: prints a sample calendar, parses it, and prints the parsed
%% tree exported as XML text to the `user' device.
test() ->
    Calendar = "
BEGIN:VCALENDAR
METHOD:PUBLISH
X-WR-CALNAME:Mi's Calendar
VERSION:2.0
PRODID:Spongecell
CALSCALE:GREGORIAN
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061206T120000
DTSTAMP:20070728T004842
LOCATION:Gordon Biersch, 640 Emerson St, Palo Alto, CA
URL:
UID:295803:spongecell.com
SUMMARY:All hands meeting
RRULE:FREQ=WEEKLY;INTERVAL=1
DTEND;TZID=America/Los_Angeles:20061206T130000
DESCRIPTION:
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061207T120000
DTSTAMP:20070728T004842
LOCATION:395 ano nuevo ave\, sunnyvale\, ca
URL:
UID:295802:spongecell.com
SUMMARY:Company lunch
RRULE:FREQ=WEEKLY;INTERVAL=1
DTEND;TZID=America/Los_Angeles:20061207T130000
DESCRIPTION:Let's have lots of beer!! (well\, and some code review :)
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20061213T123000
DTSTAMP:20070728T004842
LOCATION:369 S California Ave\, Palo Alto\, CA
URL:
UID:295714:spongecell.com
SUMMARY:Ben is back.. want to meet again
DTEND;TZID=America/Los_Angeles:20061213T133000
DESCRIPTION:Re: Ben is back.. want to meet again\n Marc
END:VEVENT
BEGIN:VEVENT
DTSTART;TZID=America/Los_Angeles:20070110T200000
DTSTAMP:20070728T004842
LOCATION:
URL:
UID:304529:spongecell.com
SUMMARY:flight back home
DTEND;TZID=America/Los_Angeles:20070110T210000
DESCRIPTION:
END:VEVENT
BEGIN:VTIMEZONE
TZID:America/Los_Angeles
BEGIN:STANDARD
DTSTART:20071104T000000
TZNAME:PST
RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU
TZOFFSETFROM:-0700
TZOFFSETTO:-0800
END:STANDARD
BEGIN:DAYLIGHT
DTSTART:20070311T000000
TZNAME:PDT
RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=1SU
TZOFFSETFROM:-0800
TZOFFSETTO:-0700
END:DAYLIGHT
END:VTIMEZONE
END:VCALENDAR


",
    io:fwrite(user, "Text: ~s~n", [Calendar]),
    %% parse/1 must deliver a finished top element for the sample
    {ok, Tree} = parse(Calendar),
    Exported = xmerl:export_simple([Tree], xmerl_xml),
    io:fwrite(user, "Parsed: ~n~p~n", [lists:flatten(Exported)]).

You may have noticed that ?stateMachine can be pointed to a json_machine:state/2 some day, and then we can get a JSON result without modifying the icalendar parser (ical_parser.erl).

This can also be applied to JSON<->XML transformation. Actually, I think SAX events are a good interface for transforming data objects between various formats. It's also a bit Erlang style (event passing). The parser and the state machine can communicate via SAX events as two separate processes and live with send/receive.

分享到:
评论

相关推荐

    Sublime.Text.Build.3078._Win_32bit破解主文件

    Build Systems: Renamed "keyfile" to "keyfiles", now accepting a list of files that can trigger the build system (e.g., ["Makefile", "makefile"]) Improved change detection for files that disappear and ...

    Python Programming 8 Simple Steps to Learn Python in 24 hours

    Python Programming: 8 Simple Steps to Learn Python Programming Language in 24 hours! Practical Python Programming for Beginners, Python Commands and Python Language (Python Programming Crush Course) ...

    Using LUA with Visual C++ (Introduction)

    The first one is the pointer to the LUA state, the second one is a pointer to a user-defined reader function, the third pointer is a user-defined value that the reader function will receive, and the ...

    Solaris 10 System Administration Essentials

    8.5.4 How to Identify a Defective Sector by Performing a Surface Analysis 221 8.5.5 How to Repair a Defective Sector 222 8.5.6 How to Display the Partition Table or Slice Information 223 8.5.7 ...

    TcpDemo.zip

    在TCPDemo中,首先会涉及网络编程的基本概念,如套接字(Socket)的创建、绑定(Binding)、监听(Listening)和接受(Accepting)。套接字是网络通信的端点,通过它,服务器可以接收来自客户端的连接请求。创建套接...

    Google C++ Style Guide(Google C++编程规范)高清PDF

    To guarantee uniqueness, they should be based on the full path in a project's source tree. For example, the file foo/src/bar/baz.h in project foo should have the following guard: #ifndef FOO_BAR_BAZ...

    sed-awk-2nd-edition.chm

    The book begins with an overview and a tutorial that demonstrate a progression in functionality from grep to sed to awk. sed and awk share a similar command-line syntax, accepting user instructions in...

    全套人教版八年级英语上册Unit 9同步练习题及答案14精选.doc

    Accepting: `Sure, I'd love to.` Refusing: `I'd love to, but I have too much homework to finish.` Reason: `I have to prepare for an exam.` Inviting: `Would you like to come to my birthday party?` ...

    CS193P IOS APPLICATION DEVELOPMENT Assignment 1 Walkthrough.pdf

    5. **Building a Calculator Application:** The final goal is to build a calculator application, which will serve as a practical example of applying the concepts learned in the previous steps. ...

    Manning.Spring.in.Action.4th.Edition.2014.11.epub

    17.1. A brief introduction to asynchronous messaging 17.1.1. Sending messages 17.1.2. Assessing the benefits of asynchronous messaging 17.2. Sending messages with JMS 17.2.1. Setting up a message ...

    Mastering Linux Shell Scripting 2nd Edition

    Then, you'll learn how to write a simple bash script and how to edit your bash script using Linux editors. Following this, you will learn how to define a variable and the visibility of a variable. ...

    rfc全部文档离线下载rfc1-rfc8505

    possible to query an IMP about the state of a link (although it might be possible to query an IMP about the recent history of a link -- quite a different matter!). The other primitive ...

    Packt.Python.for.Finance.2nd.Edition.2017

    - **Introduction to Pandas**: Pandas is a powerful data manipulation library designed to make working with relational or labeled data both easy and intuitive. It provides data structures for ...

    visual assist 10.5.1707

    The text caret is placed in the correct location after accepting a .NET Generic from a suggestion list in VB 2008. (case=20259) 8257 When typing a parameter in a C# LINQ predicate function, focus is ...

    visual assist 1707破解

    * The text caret is placed in the correct location after accepting a .NET Generic from a suggestion list in VB 2008. (case=20259) 8257 * When typing a parameter in a C# LINQ predicate function, ...

    Accepting-Payment:接受付款

    "Accepting-Payment:接受付款"这个主题主要关注如何通过技术手段处理和管理用户的支付流程,确保交易的安全和顺畅。在这个过程中,CSS(Cascading Style Sheets)虽然不是核心的支付处理技术,但作为前端样式控制的...

Global site tag (gtag.js) - Google Analytics