diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..fa0127ec2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,70 @@ +# Directories # +/build/ +target/ + +# OS Files # +.DS_Store + +*.class + +# Package Files # +*.jar +# But not these files... +!/hoodie-cli/lib/dnl/utils/textutils/0.3.3/textutils-0.3.3.jar +*.war +*.ear +*.db + +###################### +# OSX +###################### + +.DS_Store + +# Thumbnails +._* + +###################### +# Eclipse +###################### + +*.pydevproject +.project +.metadata +tmp/** +tmp/**/* +*.tmp +*.bak +*.swp +*.pyc +*~.nib +local.properties +.classpath +.settings/ +.loadpath +/src/main/resources/rebel.xml +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + +####################################### +# IntelliJ specific files/directories # +####################################### +.out +.idea +*.ipr +*.iws +*.iml + +####################################### +# Maven +####################################### +dependency-reduced-pom.xml diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 000000000..a49fc5ef2 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,614 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +---- +This project bundles portions of the 'JQuery' project under the terms of the MIT license. + + Copyright 2012 jQuery Foundation and other contributors + http://jquery.com/ + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +---- +This project bundles a derivative of portions of the 'Asciidoctor' project +under the terms of the MIT license. 
+ + The MIT License + Copyright (C) 2012-2015 Dan Allen, Ryan Waldron and the Asciidoctor Project + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + +---- +This project incorporates portions of the 'Protocol Buffers' project available +under a '3-clause BSD' license. + + Copyright 2008, Google Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. 
nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Code generated by the Protocol Buffer compiler is owned by the owner + of the input file used when generating it. This code is not + standalone and requires a support library to be linked with it. This + support library is itself covered by the above license. + +---- +This project bundles a derivative image for our Orca Logo. This image is +available under the Creative Commons By Attribution 3.0 License. + + Creative Commons Legal Code + + Attribution 3.0 Unported + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR + DAMAGES RESULTING FROM ITS USE. + + License + + THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE + COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY + COPYRIGHT AND/OR OTHER APPLICABLE LAW. 
ANY USE OF THE WORK OTHER THAN AS + AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. + + BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE + TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY + BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS + CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND + CONDITIONS. + + 1. Definitions + + a. "Adaptation" means a work based upon the Work, or upon the Work and + other pre-existing works, such as a translation, adaptation, + derivative work, arrangement of music or other alterations of a + literary or artistic work, or phonogram or performance and includes + cinematographic adaptations or any other form in which the Work may be + recast, transformed, or adapted including in any form recognizably + derived from the original, except that a work that constitutes a + Collection will not be considered an Adaptation for the purpose of + this License. For the avoidance of doubt, where the Work is a musical + work, performance or phonogram, the synchronization of the Work in + timed-relation with a moving image ("synching") will be considered an + Adaptation for the purpose of this License. + b. "Collection" means a collection of literary or artistic works, such as + encyclopedias and anthologies, or performances, phonograms or + broadcasts, or other works or subject matter other than works listed + in Section 1(f) below, which, by reason of the selection and + arrangement of their contents, constitute intellectual creations, in + which the Work is included in its entirety in unmodified form along + with one or more other contributions, each constituting separate and + independent works in themselves, which together are assembled into a + collective whole. A work that constitutes a Collection will not be + considered an Adaptation (as defined above) for the purposes of this + License. + c. 
"Distribute" means to make available to the public the original and + copies of the Work or Adaptation, as appropriate, through sale or + other transfer of ownership. + d. "Licensor" means the individual, individuals, entity or entities that + offer(s) the Work under the terms of this License. + e. "Original Author" means, in the case of a literary or artistic work, + the individual, individuals, entity or entities who created the Work + or if no individual or entity can be identified, the publisher; and in + addition (i) in the case of a performance the actors, singers, + musicians, dancers, and other persons who act, sing, deliver, declaim, + play in, interpret or otherwise perform literary or artistic works or + expressions of folklore; (ii) in the case of a phonogram the producer + being the person or legal entity who first fixes the sounds of a + performance or other sounds; and, (iii) in the case of broadcasts, the + organization that transmits the broadcast. + f. "Work" means the literary and/or artistic work offered under the terms + of this License including without limitation any production in the + literary, scientific and artistic domain, whatever may be the mode or + form of its expression including digital form, such as a book, + pamphlet and other writing; a lecture, address, sermon or other work + of the same nature; a dramatic or dramatico-musical work; a + choreographic work or entertainment in dumb show; a musical + composition with or without words; a cinematographic work to which are + assimilated works expressed by a process analogous to cinematography; + a work of drawing, painting, architecture, sculpture, engraving or + lithography; a photographic work to which are assimilated works + expressed by a process analogous to photography; a work of applied + art; an illustration, map, plan, sketch or three-dimensional work + relative to geography, topography, architecture or science; a + performance; a broadcast; a phonogram; a compilation of 
data to the + extent it is protected as a copyrightable work; or a work performed by + a variety or circus performer to the extent it is not otherwise + considered a literary or artistic work. + g. "You" means an individual or entity exercising rights under this + License who has not previously violated the terms of this License with + respect to the Work, or who has received express permission from the + Licensor to exercise rights under this License despite a previous + violation. + h. "Publicly Perform" means to perform public recitations of the Work and + to communicate to the public those public recitations, by any means or + process, including by wire or wireless means or public digital + performances; to make available to the public Works in such a way that + members of the public may access these Works from a place and at a + place individually chosen by them; to perform the Work to the public + by any means or process and the communication to the public of the + performances of the Work, including by public digital performance; to + broadcast and rebroadcast the Work by any means including signs, + sounds or images. + i. "Reproduce" means to make copies of the Work by any means including + without limitation by sound or visual recordings and the right of + fixation and reproducing fixations of the Work, including storage of a + protected performance or phonogram in digital form or other electronic + medium. + + 2. Fair Dealing Rights. Nothing in this License is intended to reduce, + limit, or restrict any uses free from copyright or rights arising from + limitations or exceptions that are provided for in connection with the + copyright protection under copyright law or other applicable laws. + + 3. License Grant. 
Subject to the terms and conditions of this License, + Licensor hereby grants You a worldwide, royalty-free, non-exclusive, + perpetual (for the duration of the applicable copyright) license to + exercise the rights in the Work as stated below: + + a. to Reproduce the Work, to incorporate the Work into one or more + Collections, and to Reproduce the Work as incorporated in the + Collections; + b. to create and Reproduce Adaptations provided that any such Adaptation, + including any translation in any medium, takes reasonable steps to + clearly label, demarcate or otherwise identify that changes were made + to the original Work. For example, a translation could be marked "The + original work was translated from English to Spanish," or a + modification could indicate "The original work has been modified."; + c. to Distribute and Publicly Perform the Work including as incorporated + in Collections; and, + d. to Distribute and Publicly Perform Adaptations. + e. For the avoidance of doubt: + + i. Non-waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme cannot be waived, the Licensor + reserves the exclusive right to collect such royalties for any + exercise by You of the rights granted under this License; + ii. Waivable Compulsory License Schemes. In those jurisdictions in + which the right to collect royalties through any statutory or + compulsory licensing scheme can be waived, the Licensor waives the + exclusive right to collect such royalties for any exercise by You + of the rights granted under this License; and, + iii. Voluntary License Schemes. The Licensor waives the right to + collect royalties, whether individually or, in the event that the + Licensor is a member of a collecting society that administers + voluntary licensing schemes, via that society, from any exercise + by You of the rights granted under this License. 
+ + The above rights may be exercised in all media and formats whether now + known or hereafter devised. The above rights include the right to make + such modifications as are technically necessary to exercise the rights in + other media and formats. Subject to Section 8(f), all rights not expressly + granted by Licensor are hereby reserved. + + 4. Restrictions. The license granted in Section 3 above is expressly made + subject to and limited by the following restrictions: + + a. You may Distribute or Publicly Perform the Work only under the terms + of this License. You must include a copy of, or the Uniform Resource + Identifier (URI) for, this License with every copy of the Work You + Distribute or Publicly Perform. You may not offer or impose any terms + on the Work that restrict the terms of this License or the ability of + the recipient of the Work to exercise the rights granted to that + recipient under the terms of the License. You may not sublicense the + Work. You must keep intact all notices that refer to this License and + to the disclaimer of warranties with every copy of the Work You + Distribute or Publicly Perform. When You Distribute or Publicly + Perform the Work, You may not impose any effective technological + measures on the Work that restrict the ability of a recipient of the + Work from You to exercise the rights granted to that recipient under + the terms of the License. This Section 4(a) applies to the Work as + incorporated in a Collection, but this does not require the Collection + apart from the Work itself to be made subject to the terms of this + License. If You create a Collection, upon notice from any Licensor You + must, to the extent practicable, remove from the Collection any credit + as required by Section 4(b), as requested. If You create an + Adaptation, upon notice from any Licensor You must, to the extent + practicable, remove from the Adaptation any credit as required by + Section 4(b), as requested. + b. 
If You Distribute, or Publicly Perform the Work or any Adaptations or + Collections, You must, unless a request has been made pursuant to + Section 4(a), keep intact all copyright notices for the Work and + provide, reasonable to the medium or means You are utilizing: (i) the + name of the Original Author (or pseudonym, if applicable) if supplied, + and/or if the Original Author and/or Licensor designate another party + or parties (e.g., a sponsor institute, publishing entity, journal) for + attribution ("Attribution Parties") in Licensor's copyright notice, + terms of service or by other reasonable means, the name of such party + or parties; (ii) the title of the Work if supplied; (iii) to the + extent reasonably practicable, the URI, if any, that Licensor + specifies to be associated with the Work, unless such URI does not + refer to the copyright notice or licensing information for the Work; + and (iv) , consistent with Section 3(b), in the case of an Adaptation, + a credit identifying the use of the Work in the Adaptation (e.g., + "French translation of the Work by Original Author," or "Screenplay + based on original Work by Original Author"). The credit required by + this Section 4 (b) may be implemented in any reasonable manner; + provided, however, that in the case of a Adaptation or Collection, at + a minimum such credit will appear, if a credit for all contributing + authors of the Adaptation or Collection appears, then as part of these + credits and in a manner at least as prominent as the credits for the + other contributing authors. 
For the avoidance of doubt, You may only + use the credit required by this Section for the purpose of attribution + in the manner set out above and, by exercising Your rights under this + License, You may not implicitly or explicitly assert or imply any + connection with, sponsorship or endorsement by the Original Author, + Licensor and/or Attribution Parties, as appropriate, of You or Your + use of the Work, without the separate, express prior written + permission of the Original Author, Licensor and/or Attribution + Parties. + c. Except as otherwise agreed in writing by the Licensor or as may be + otherwise permitted by applicable law, if You Reproduce, Distribute or + Publicly Perform the Work either by itself or as part of any + Adaptations or Collections, You must not distort, mutilate, modify or + take other derogatory action in relation to the Work which would be + prejudicial to the Original Author's honor or reputation. Licensor + agrees that in those jurisdictions (e.g. Japan), in which any exercise + of the right granted in Section 3(b) of this License (the right to + make Adaptations) would be deemed to be a distortion, mutilation, + modification or other derogatory action prejudicial to the Original + Author's honor and reputation, the Licensor will waive or not assert, + as appropriate, this Section, to the fullest extent permitted by the + applicable national law, to enable You to reasonably exercise Your + right under Section 3(b) of this License (right to make Adaptations) + but not otherwise. + + 5. 
Representations, Warranties and Disclaimer + + UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR + OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY + KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, + INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, + FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF + LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, + WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION + OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. + + 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE + LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR + ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES + ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS + BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + + 7. Termination + + a. This License and the rights granted hereunder will terminate + automatically upon any breach by You of the terms of this License. + Individuals or entities who have received Adaptations or Collections + from You under this License, however, will not have their licenses + terminated provided such individuals or entities remain in full + compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will + survive any termination of this License. + b. Subject to the above terms and conditions, the license granted here is + perpetual (for the duration of the applicable copyright in the Work). 
+ Notwithstanding the above, Licensor reserves the right to release the + Work under different license terms or to stop distributing the Work at + any time; provided, however that any such election will not serve to + withdraw this License (or any other license that has been, or is + required to be, granted under the terms of this License), and this + License will continue in full force and effect unless terminated as + stated above. + + 8. Miscellaneous + + a. Each time You Distribute or Publicly Perform the Work or a Collection, + the Licensor offers to the recipient a license to the Work on the same + terms and conditions as the license granted to You under this License. + b. Each time You Distribute or Publicly Perform an Adaptation, Licensor + offers to the recipient a license to the original Work on the same + terms and conditions as the license granted to You under this License. + c. If any provision of this License is invalid or unenforceable under + applicable law, it shall not affect the validity or enforceability of + the remainder of the terms of this License, and without further action + by the parties to this agreement, such provision shall be reformed to + the minimum extent necessary to make such provision valid and + enforceable. + d. No term or provision of this License shall be deemed waived and no + breach consented to unless such waiver or consent shall be in writing + and signed by the party to be charged with such waiver or consent. + e. This License constitutes the entire agreement between the parties with + respect to the Work licensed here. There are no understandings, + agreements or representations with respect to the Work not specified + here. Licensor shall not be bound by any additional provisions that + may appear in any communication from You. This License may not be + modified without the mutual written agreement of the Licensor and You. + f. 
The rights granted under, and the subject matter referenced, in this + License were drafted utilizing the terminology of the Berne Convention + for the Protection of Literary and Artistic Works (as amended on + September 28, 1979), the Rome Convention of 1961, the WIPO Copyright + Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 + and the Universal Copyright Convention (as revised on July 24, 1971). + These rights and subject matter take effect in the relevant + jurisdiction in which the License terms are sought to be enforced + according to the corresponding provisions of the implementation of + those treaty provisions in the applicable national law. If the + standard suite of rights granted under applicable copyright law + includes additional rights not granted under this License, such + additional rights are deemed to be included in the License; this + License is not intended to restrict the license of any rights under + applicable law. + + + Creative Commons Notice + + Creative Commons is not a party to this License, and makes no warranty + whatsoever in connection with the Work. Creative Commons will not be + liable to You or any party on any legal theory for any damages + whatsoever, including without limitation any general, special, + incidental or consequential damages arising in connection to this + license. Notwithstanding the foregoing two (2) sentences, if Creative + Commons has expressly identified itself as the Licensor hereunder, it + shall have all rights and obligations of Licensor. + + Except for the limited purpose of indicating to the public that the + Work is licensed under the CCPL, Creative Commons does not authorize + the use by either party of the trademark "Creative Commons" or any + related trademark or logo of Creative Commons without the prior + written consent of Creative Commons. 
Any permitted use will be in
+ compliance with Creative Commons' then-current trademark usage
+ guidelines, as may be published on its website or otherwise made
+ available upon request from time to time. For the avoidance of doubt,
+ this trademark restriction does not form part of this License.
+
+ Creative Commons may be contacted at https://creativecommons.org/. diff --git a/README.md b/README.md new file mode 100644 index 000000000..5a3d3f556 --- /dev/null +++ b/README.md @@ -0,0 +1,205 @@ +Hoodie - Spark Library For Upserts & Incremental Consumption
+=============================================================
+
+- - - -
+
+# Core Functionality #
+
+Hoodie provides the following abilities on a Hive table
+
+ * Upsert (how do I change the table efficiently?)
+ * Incremental consumption (how do I obtain records that changed?)
+
+
+Ultimately, make the built Hive table, queryable via Spark & Presto as well.
+
+
+# Code & Project Structure #
+
+ * hoodie-client : Spark client library to take a bunch of inserts + updates and apply them to a Hoodie table
+ * hoodie-common : Common code shared between different artifacts of Hoodie
+
+
+ We have embraced the [Google Java code style](https://google.github.io/styleguide/javaguide.html). Please setup your IDE accordingly with style files from [here](https://github.com/google/styleguide)
+
+
+# Quickstart #
+
+Check out code and pull it into Intellij as a normal maven project.
+> You might want to add your spark assembly jar to project dependencies under "Module Settings", to be able to run Spark from IDE
+
+Setup your local hadoop/hive test environment.
See [this](http://www.bytearray.io/2016/05/setting-up-hadoopyarnsparkhive-on-mac.html) for reference + +## Run the Hoodie Test Job ## + +Create the output folder on your local HDFS +``` +hdfs dfs -mkdir -p /tmp/hoodie/sample-table +``` + +You can run the __HoodieClientExample__ class, to place a set of inserts + updates onto your HDFS at /tmp/hoodie/sample-table + +## Access via Hive ## + +Add in the hoodie-mr jar so, Hive can pick up the right files to hit, to answer the query. + +``` +hive> add jar file:///tmp/hoodie-mr-0.1.jar; +Added [file:///tmp/hoodie-mr-0.1.jar] to class path +Added resources: [file:///tmp/hoodie-mr-0.1.jar] +``` + +Then, you need to create a table and register the sample partitions + + +``` +drop table hoodie_test; +CREATE EXTERNAL TABLE hoodie_test(`_row_key` string, +`_hoodie_commit_time` string, +`_hoodie_commit_seqno` string, + rider string, + driver string, + begin_lat double, + begin_lon double, + end_lat double, + end_lon double, + fare double) +PARTITIONED BY (`datestr` string) +ROW FORMAT SERDE + 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' +STORED AS INPUTFORMAT + 'com.uber.hoodie.hadoop.HoodieInputFormat' +OUTPUTFORMAT + 'com.uber.hoodie.hadoop.HoodieOutputFormat' +LOCATION + 'hdfs:///tmp/hoodie/sample-table'; + +ALTER TABLE `hoodie_test` ADD IF NOT EXISTS PARTITION (datestr='2016-03-15') LOCATION 'hdfs:///tmp/hoodie/sample-table/2016/03/15'; +ALTER TABLE `hoodie_test` ADD IF NOT EXISTS PARTITION (datestr='2015-03-16') LOCATION 'hdfs:///tmp/hoodie/sample-table/2015/03/16'; +ALTER TABLE `hoodie_test` ADD IF NOT EXISTS PARTITION (datestr='2015-03-17') LOCATION 'hdfs:///tmp/hoodie/sample-table/2015/03/17'; +``` + +Let's first perform a query on the latest committed snapshot of the table + +``` +hive> select count(*) from hoodie_test; +... 
+OK +100 +Time taken: 18.05 seconds, Fetched: 1 row(s) +hive> +``` + + +Let's now perform a query, to obtain the changed rows since a commit in the past + +``` +hive> set hoodie.scan.mode=INCREMENTAL; +hive> set hoodie.last.commitTs=001; +hive> select `_hoodie_commit_time`, rider, driver from hoodie_test limit 10; +OK +All commits :[001, 002] +002 rider-001 driver-001 +002 rider-001 driver-001 +002 rider-002 driver-002 +002 rider-001 driver-001 +002 rider-001 driver-001 +002 rider-002 driver-002 +002 rider-001 driver-001 +002 rider-002 driver-002 +002 rider-002 driver-002 +002 rider-001 driver-001 +Time taken: 0.056 seconds, Fetched: 10 row(s) +hive> +hive> +``` + + +## Access via Spark ## + +Spark is super easy, once you get Hive working as above. Just spin up a Spark Shell as below + +``` +$ cd $SPARK_INSTALL +$ export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop +$ spark-shell --jars /tmp/hoodie-mr-0.1.jar --driver-class-path $HADOOP_CONF_DIR --conf spark.sql.hive.convertMetastoreParquet=false + + +scala> sqlContext.sql("show tables").show(10000) +scala> sqlContext.sql("describe hoodie_test").show(10000) +scala> sqlContext.sql("select count(*) from hoodie_test").show(10000) +``` + + + +## Access via Presto ## + +Checkout the 'hoodie-integration' branch, build off it, and place your installation somewhere. 
+ +* Copy the hoodie-mr jar into $PRESTO_INSTALL/plugin/hive-hadoop2/ + +* Change your catalog config, to make presto respect the __HoodieInputFormat__ + +``` +$ cat etc/catalog/hive.properties +connector.name=hive-hadoop2 +hive.metastore.uri=thrift://localhost:10000 +hive.respect-input-format-splits=true +``` + +startup your server and you should be able to query the same Hive table via Presto + +``` +show columns from hive.default.hoodie_test; +select count(*) from hive.default.hoodie_test +``` + +> NOTE: As of now, Presto has trouble accessing HDFS locally, hence create a new table as above, backed on local filesystem file:// as a workaround + +# Planned # +* Support for Self Joins - As of now, you cannot incrementally consume the same table more than once, since the InputFormat does not understand the QueryPlan. +* Hoodie Spark Datasource - Allows for reading and writing data back using Apache Spark natively (without falling back to InputFormat), which can be more performant +* Hoodie Presto Connector - Allows for querying data managed by Hoodie using Presto natively, which can again boost [performance](https://prestodb.io/docs/current/release/release-0.138.html) + + +# Hoodie Admin CLI +# Launching Command Line # + + + +* mvn clean install in hoodie-cli +* ./hoodie-cli + +If all is good you should get a command prompt similar to this one +``` +prasanna@hadoopgw01-sjc1:~/hoodie/hoodie-cli$ ./hoodie-cli.sh +16/07/13 21:27:47 INFO xml.XmlBeanDefinitionReader: Loading XML bean definitions from URL [jar:file:/home/prasanna/hoodie/hoodie-cli/target/hoodie-cli-0.1-SNAPSHOT.jar!/META-INF/spring/spring-shell-plugin.xml] +16/07/13 21:27:47 INFO support.GenericApplicationContext: Refreshing org.springframework.context.support.GenericApplicationContext@372688e8: startup date [Wed Jul 13 21:27:47 UTC 2016]; root of context hierarchy +16/07/13 21:27:47 INFO annotation.AutowiredAnnotationBeanPostProcessor: JSR-330 'javax.inject.Inject' annotation found and supported for 
autowiring +============================================ +* * +* _ _ _ _ * +* | | | | | (_) * +* | |__| | ___ ___ __| |_ ___ * +* | __ |/ _ \ / _ \ / _` | |/ _ \ * +* | | | | (_) | (_) | (_| | | __/ * +* |_| |_|\___/ \___/ \__,_|_|\___| * +* * +============================================ + +Welcome to Hoodie CLI. Please type help if you are looking for help. +hoodie-> +``` + +# Commands # + + * connect --path [dataset_path] : Connect to the specific dataset by its path + * commits show : Show all details about the commits + * commits refresh : Refresh the commits from HDFS + * commit rollback --commit [commitTime] : Rollback a commit + * commit showfiles --commit [commitTime] : Show details of a commit (lists all the files modified along with other metrics) + * commit showpartitions --commit [commitTime] : Show details of a commit (lists statistics aggregated at partition level) + + * commits compare --path [otherBasePath] : Compares the current dataset commits with the path provided and tells you how many commits behind or ahead + * stats wa : Calculate commit level and overall write amplification factor (total records written / total records upserted) + * help diff --git a/hoodie-cli/pom.xml b/hoodie-cli/pom.xml index 9238c82e9..c20792e68 100644 --- a/hoodie-cli/pom.xml +++ b/hoodie-cli/pom.xml @@ -187,16 +187,6 @@ hoodie-common ${project.version} - - com.uber.hoodie - hoodie-mr - ${project.version} - - - com.uber.hoodie - hoodie-tools - ${project.version} - junit junit-dep diff --git a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala index 0eccd9bf6..2343035bb 100644 --- a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala +++ b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala @@ -22,7 +22,7 @@ import com.uber.hoodie.common.model.HoodieRecord import com.uber.hoodie.common.util.ParquetUtils import com.uber.hoodie.config.{HoodieIndexConfig, 
HoodieStorageConfig} import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter} -import com.uber.hoodie.stream.GenericHoodiePayload +import com.uber.hoodie.common.GenericHoodiePayload import org.apache.avro.Schema import org.apache.avro.generic.IndexedRecord import org.apache.hadoop.conf.Configuration diff --git a/hoodie-client/pom.xml b/hoodie-client/pom.xml new file mode 100644 index 000000000..a1d118ccd --- /dev/null +++ b/hoodie-client/pom.xml @@ -0,0 +1,142 @@ + + + + + + hoodie + com.uber.hoodie + 0.2.5-SNAPSHOT + + 4.0.0 + + hoodie-client + jar + + + + org.codehaus.mojo + cobertura-maven-plugin + + + org.apache.maven.plugins + maven-jar-plugin + + + + test-jar + + + + + + org.apache.rat + apache-rat-plugin + + + + + + src/main/resources + + + src/test/resources + + + + + + + com.uber.hoodie + hoodie-common + ${project.version} + + + com.uber.hoodie + hoodie-common + ${project.version} + test-jar + test + + + io.dropwizard.metrics + metrics-graphite + + + io.dropwizard.metrics + metrics-core + + + + + log4j + log4j + + + + org.apache.hadoop + hadoop-client + + + javax.servlet + * + + + + + + org.apache.parquet + parquet-avro + + + + org.apache.parquet + parquet-hadoop + + + + com.google.guava + guava + + + + org.apache.spark + spark-core_2.10 + provided + + + + org.apache.spark + spark-sql_2.10 + ${spark.version} + provided + + + + org.apache.hbase + hbase-client + + + + org.mockito + mockito-all + 1.10.19 + test + + + + + diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java new file mode 100644 index 000000000..09aa2dbc5 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie; + +import com.google.common.base.Optional; + +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.model.HoodieCommits; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.exception.HoodieException; +import com.uber.hoodie.index.HoodieBloomIndex; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.StructType; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import scala.Tuple2; + +/** + * Provides first class support for accessing Hoodie tables for data processing via 
Apache Spark. + * + * + * TODO: Need to move all read operations here, since Hoodie is a single writer & multiple reader + */ +public class HoodieReadClient implements Serializable { + + private static Logger logger = LogManager.getLogger(HoodieReadClient.class); + + private transient final JavaSparkContext jsc; + + private transient final FileSystem fs; + /** + * TODO: We need to persist the index type into hoodie.properties & be able to access the index + * just with a simple basepath pointing to the dataset. Until, then just always assume a + * BloomIndex + */ + private transient final HoodieBloomIndex index; + private HoodieTableMetadata metadata; + private transient Optional sqlContextOpt; + + + /** + * @param basePath path to Hoodie dataset + */ + public HoodieReadClient(JavaSparkContext jsc, String basePath) { + this.jsc = jsc; + this.fs = FSUtils.getFs(); + this.metadata = new HoodieTableMetadata(fs, basePath); + this.index = new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc); + this.sqlContextOpt = Optional.absent(); + } + + /** + * + * @param jsc + * @param basePath + * @param sqlContext + */ + public HoodieReadClient(JavaSparkContext jsc, String basePath, SQLContext sqlContext) { + this(jsc, basePath); + this.sqlContextOpt = Optional.of(sqlContext); + } + + /** + * Adds support for accessing Hoodie built tables from SparkSQL, as you normally would. 
+ * + * @return SparkConf object to be used to construct the SparkContext by caller + */ + public static SparkConf addHoodieSupport(SparkConf conf) { + conf.set("spark.sql.hive.convertMetastoreParquet", "false"); + return conf; + } + + private void assertSqlContext() { + if (!sqlContextOpt.isPresent()) { + throw new IllegalStateException("SQLContext must be set, when performing dataframe operations"); + } + } + + /** + * Given a bunch of hoodie keys, fetches all the individual records out as a data frame + * + * @return a dataframe + */ + public DataFrame read(JavaRDD hoodieKeys, int parallelism) + throws Exception { + + assertSqlContext(); + JavaPairRDD> keyToFileRDD = + index.fetchRecordLocation(hoodieKeys, metadata); + List paths = keyToFileRDD + .filter(new Function>, Boolean>() { + @Override + public Boolean call(Tuple2> keyFileTuple) throws Exception { + return keyFileTuple._2().isPresent(); + } + }) + .map(new Function>, String>() { + + @Override + public String call(Tuple2> keyFileTuple) throws Exception { + return keyFileTuple._2().get(); + } + }).collect(); + + // record locations might be same for multiple keys, so need a unique list + Set uniquePaths = new HashSet<>(paths); + DataFrame originalDF = sqlContextOpt.get().read() + .parquet(uniquePaths.toArray(new String[uniquePaths.size()])); + StructType schema = originalDF.schema(); + JavaPairRDD keyRowRDD = originalDF.javaRDD() + .mapToPair(new PairFunction() { + @Override + public Tuple2 call(Row row) throws Exception { + HoodieKey key = new HoodieKey( + row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), + row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); + return new Tuple2<>(key, row); + } + }); + + // Now, we need to further filter out, for only rows that match the supplied hoodie keys + JavaRDD rowRDD = keyRowRDD.join(keyToFileRDD, parallelism) + .map(new Function>>, Row>() { + @Override + public Row call(Tuple2>> tuple) throws Exception { + return tuple._2()._1(); + } + }); + + return 
sqlContextOpt.get().createDataFrame(rowRDD, schema); + } + + /** + * Reads the paths under the a hoodie dataset out as a DataFrame + */ + public DataFrame read(String... paths) { + assertSqlContext(); + List filteredPaths = new ArrayList<>(); + try { + for (String path : paths) { + if (!path.contains(metadata.getBasePath())) { + throw new HoodieException("Path " + path + + " does not seem to be a part of a Hoodie dataset at base path " + + metadata.getBasePath()); + } + + FileStatus[] latestFiles = metadata.getLatestVersions(fs.globStatus(new Path(path))); + for (FileStatus file : latestFiles) { + filteredPaths.add(file.getPath().toString()); + } + } + return sqlContextOpt.get().read() + .parquet(filteredPaths.toArray(new String[filteredPaths.size()])); + } catch (Exception e) { + throw new HoodieException("Error reading hoodie dataset as a dataframe", e); + } + } + + /** + * Obtain all new data written into the Hoodie dataset since the given timestamp. + * + * If you made a prior call to {@link HoodieReadClient#latestCommit()}, it gives you all data in + * the time window (commitTimestamp, latestCommit) + */ + public DataFrame readSince(String lastCommitTimestamp) { + + List commitsToReturn = metadata.findCommitsAfter(lastCommitTimestamp, Integer.MAX_VALUE); + //TODO: we can potentially trim this down to only affected partitions, using CommitMetadata + try { + + // Go over the commit metadata, and obtain the new files that need to be read. 
HashMap fileIdToFullPath = new HashMap<>();
+ for (String commit: commitsToReturn) {
+ // get files from each commit, and replace any previous versions
+ fileIdToFullPath.putAll(metadata.getCommitMetadata(commit).getFileIdAndFullPaths());
+ }
+
+ return sqlContextOpt.get().read()
+ .parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
+ .filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTimestamp));
+ } catch (IOException e) {
+ throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTimestamp, e);
+ }
+ }
+
+ /**
+ * Obtain the data written at the given commit, as a DataFrame.
+ */
+ public DataFrame readCommit(String commitTime) {
+ assertSqlContext();
+ HoodieCommits commits = metadata.getAllCommits();
+ if (!commits.contains(commitTime)) {
+ throw new HoodieException("No commit exists at " + commitTime);
+ }
+
+ try {
+ HoodieCommitMetadata commitMetadata = metadata.getCommitMetadata(commitTime);
+ Collection paths = commitMetadata.getFileIdAndFullPaths().values();
+ return sqlContextOpt.get().read()
+ .parquet(paths.toArray(new String[paths.size()]))
+ .filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
+ } catch (Exception e) {
+ throw new HoodieException("Error reading commit " + commitTime, e);
+ }
+ }
+
+ /**
+ * Checks if the given [Keys] exists in the hoodie table and returns [Key,
+ * Optional] If the optional FullFilePath value is not present, then the key is
+ * not found. If the FullFilePath value is present, it is the path component (without scheme) of
+ * the URI underlying file
+ */
+ public JavaPairRDD> checkExists(
+ JavaRDD hoodieKeys) {
+ return index.fetchRecordLocation(hoodieKeys, metadata);
+ }
+
+ /**
+ * Filter out HoodieRecords that already exist in the output folder. This is useful in
+ * deduplication.
+ *
+ * @param hoodieRecords Input RDD of Hoodie records.
+ * @return A subset of hoodieRecords RDD, with existing records filtered out.
+ */ + public JavaRDD filterExists(JavaRDD hoodieRecords) { + JavaRDD recordsWithLocation = index.tagLocation(hoodieRecords, metadata); + return recordsWithLocation.filter(new Function() { + @Override + public Boolean call(HoodieRecord v1) throws Exception { + return !v1.isCurrentLocationKnown(); + } + }); + } + + /** + * Checks if the Hoodie dataset has new data since given timestamp. This can be subsequently + * used to call {@link HoodieReadClient#readSince(String)} to perform incremental processing. + */ + public boolean hasNewCommits(String commitTimestamp) { + return listCommitsSince(commitTimestamp).size() > 0; + } + + /** + * + * @param commitTimestamp + * @return + */ + public List listCommitsSince(String commitTimestamp) { + return metadata.getAllCommits().findCommitsAfter(commitTimestamp, Integer.MAX_VALUE); + } + + /** + * Returns the last successful commit (a successful write operation) into a Hoodie table. + */ + public String latestCommit() { + return metadata.getAllCommits().lastCommit(); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java new file mode 100644 index 000000000..edc1e8162 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java @@ -0,0 +1,556 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie; + +import com.codahale.metrics.Timer; +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.exception.HoodieCommitException; +import com.uber.hoodie.exception.HoodieIOException; +import com.uber.hoodie.exception.HoodieInsertException; +import com.uber.hoodie.exception.HoodieRollbackException; +import com.uber.hoodie.exception.HoodieUpsertException; +import com.uber.hoodie.func.InsertMapFunction; +import com.uber.hoodie.index.HoodieIndex; +import com.uber.hoodie.io.HoodieCleaner; +import com.uber.hoodie.io.HoodieCommitArchiveLog; +import com.uber.hoodie.metrics.HoodieMetrics; +import com.uber.hoodie.table.HoodieTable; +import com.uber.hoodie.table.WorkloadProfile; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.Accumulator; +import org.apache.spark.Partitioner; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.api.java.function.VoidFunction; +import org.apache.spark.storage.StorageLevel; + +import 
java.io.IOException;
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Collections;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+
+import scala.Option;
+import scala.Tuple2;
+
+/**
+ * Hoodie Write Client helps you build datasets on HDFS [insert()] and then
+ * perform efficient mutations on a HDFS dataset [upsert()]
+ *
+ * Note that, at any given time, there can only be one Spark job performing
+ * these operations on a Hoodie dataset.
+ *
+ */
+public class HoodieWriteClient implements Serializable {
+
+ private static Logger logger = LogManager.getLogger(HoodieWriteClient.class);
+ private transient final FileSystem fs;
+ private transient final JavaSparkContext jsc;
+ private final HoodieWriteConfig config;
+ private transient final HoodieMetrics metrics;
+ private transient final HoodieIndex index;
+ private transient final HoodieCommitArchiveLog archiveLog;
+ private transient Timer.Context writeContext = null;
+
+ private final SimpleDateFormat FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss");
+
+ /**
+ * @param jsc
+ * @param clientConfig
+ * @throws Exception
+ */
+ public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception {
+ this(jsc, clientConfig, false);
+ }
+
+ /**
+ * @param jsc
+ * @param clientConfig
+ * @param rollbackInFlight
+ * @throws Exception
+ */
+ public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) {
+ this.fs = FSUtils.getFs();
+ this.jsc = jsc;
+ this.config = clientConfig;
+ this.index = HoodieIndex.createIndex(config, jsc);
+ this.metrics = new HoodieMetrics(config, config.getTableName());
+ this.archiveLog = new HoodieCommitArchiveLog(clientConfig);
+ if (rollbackInFlight) {
+ rollbackInflightCommits();
+ }
+ }
+
+ /**
+ * Filter out HoodieRecords that already exist in the output folder.
This is useful in + * deduplication. + * + * @param hoodieRecords Input RDD of Hoodie records. + * @return A subset of hoodieRecords RDD, with existing records filtered out. + */ + public JavaRDD> filterExists(JavaRDD> hoodieRecords) { + final HoodieTableMetadata metadata = + new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); + JavaRDD> recordsWithLocation = index.tagLocation(hoodieRecords, metadata); + return recordsWithLocation.filter(new Function, Boolean>() { + @Override + public Boolean call(HoodieRecord v1) throws Exception { + return !v1.isCurrentLocationKnown(); + } + }); + } + + /** + * Upserts a bunch of new records into the Hoodie table, at the supplied commitTime + */ + public JavaRDD upsert(JavaRDD> records, final String commitTime) { + final HoodieTableMetadata metadata = + new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName()); + writeContext = metrics.getCommitCtx(); + final HoodieTable table = + HoodieTable.getHoodieTable(metadata.getTableType(), commitTime, config, metadata); + + try { + // De-dupe/merge if needed + JavaRDD> dedupedRecords = + combineOnCondition(config.shouldCombineBeforeUpsert(), records, + config.getUpsertShuffleParallelism()); + + // perform index loop up to get existing location of records + JavaRDD> taggedRecords = index.tagLocation(dedupedRecords, metadata); + + // Cache the tagged records, so we don't end up computing both + taggedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER()); + + + WorkloadProfile profile = null; + if (table.isWorkloadProfileNeeded()) { + profile = new WorkloadProfile(taggedRecords); + logger.info("Workload profile :" + profile); + } + + // obtain the upsert partitioner, and the run the tagger records through that & get a partitioned RDD. 
      // ---- tail of upsert(): partition, write, update index, commit ----
      // NOTE(review): generic type parameters in this file were garbled by extraction and have
      // been reconstructed from usage — confirm against version control before relying on them.
      final Partitioner upsertPartitioner = table.getUpsertPartitioner(profile);
      // Key each record by (HoodieKey, current file location) so the partitioner can route
      // updates to the file they live in and bucket the inserts; then drop the key again.
      JavaRDD<HoodieRecord<T>> partitionedRecords = taggedRecords.mapToPair(
          new PairFunction<HoodieRecord<T>, Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>>() {
            @Override
            public Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> call(
                HoodieRecord<T> record) throws Exception {
              return new Tuple2<>(new Tuple2<>(record.getKey(),
                  Option.apply(record.getCurrentLocation())), record);
            }
          }).partitionBy(upsertPartitioner).map(
          new Function<Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>>, HoodieRecord<T>>() {
            @Override
            public HoodieRecord<T> call(
                Tuple2<Tuple2<HoodieKey, Option<HoodieRecordLocation>>, HoodieRecord<T>> tuple)
                throws Exception {
              return tuple._2();
            }
          });

      // Perform the actual writing. preservesPartitioning=true keeps the partitioner's routing.
      JavaRDD<WriteStatus> upsertStatusRDD = partitionedRecords.mapPartitionsWithIndex(
          new Function2<Integer, Iterator<HoodieRecord<T>>, Iterator<List<WriteStatus>>>() {
            @Override
            public Iterator<List<WriteStatus>> call(Integer partition,
                Iterator<HoodieRecord<T>> recordItr) throws Exception {
              return table.handleUpsertPartition(partition, recordItr, upsertPartitioner);
            }
          }, true).flatMap(new FlatMapFunction<List<WriteStatus>, WriteStatus>() {
        @Override
        public Iterable<WriteStatus> call(List<WriteStatus> writeStatuses)
            throws Exception {
          return writeStatuses;
        }
      });

      // Update the index back.
      JavaRDD<WriteStatus> resultRDD = index.updateLocation(upsertStatusRDD, metadata);
      // Persist before commit(): commit() triggers the job, and callers may re-inspect statuses.
      resultRDD = resultRDD.persist(config.getWriteStatusStorageLevel());
      boolean commitResult = commit(commitTime, resultRDD);
      if (!commitResult) {
        throw new HoodieCommitException("Failed to commit " + commitTime);
      }
      return resultRDD;
    } catch (Throwable e) {
      if (e instanceof HoodieUpsertException) {
        throw (HoodieUpsertException) e;
      }
      throw new HoodieUpsertException("Failed to upsert for commit time " + commitTime, e);
    }
  }

  /**
   * Returns the records deduplicated (via {@link #deduplicateRecords}) when {@code condition}
   * holds, otherwise the records unchanged.
   */
  private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition,
      JavaRDD<HoodieRecord<T>> records, int parallelism) {
    if(condition) {
      return deduplicateRecords(records, parallelism);
    }
    return records;
  }

  /**
   * Loads the given HoodieRecords, as inserts into the table.
   * (This implementation uses sortBy and attempts to control the numbers of files with less memory)
   *
   * @param records HoodieRecords to insert
   * @param commitTime Commit Time handle
   * @return JavaRDD of WriteStatus - RDD of WriteStatus to inspect errors and counts
   */
  public JavaRDD<WriteStatus> insert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
    final HoodieTableMetadata metadata =
        new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
    writeContext = metrics.getCommitCtx();
    try {
      // De-dupe/merge if needed
      JavaRDD<HoodieRecord<T>> dedupedRecords =
          combineOnCondition(config.shouldCombineBeforeInsert(), records,
              config.getInsertShuffleParallelism());

      // Now, sort the records and line them up nicely for loading.
      JavaRDD<HoodieRecord<T>> sortedRecords =
          dedupedRecords.sortBy(new Function<HoodieRecord<T>, String>() {
            @Override
            public String call(HoodieRecord<T> record) {
              // Let's use "partitionPath + key" as the sort key. Spark, will ensure
              // the records split evenly across RDD partitions, such that small partitions fit
              // into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
              return String
                  .format("%s+%s", record.getPartitionPath(), record.getRecordKey());
            }
          }, true, config.getInsertShuffleParallelism());
      JavaRDD<WriteStatus> writeStatusRDD = sortedRecords
          .mapPartitionsWithIndex(new InsertMapFunction<T>(commitTime, config, metadata),
              true).flatMap(new FlatMapFunction<List<WriteStatus>, WriteStatus>() {
            @Override
            public Iterable<WriteStatus> call(List<WriteStatus> writeStatuses)
                throws Exception {
              return writeStatuses;
            }
          });
      // Update the index back
      JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, metadata);
      // Trigger the insert and collect statuses
      statuses = statuses.persist(config.getWriteStatusStorageLevel());
      boolean commitResult = commit(commitTime, statuses);
      if (!commitResult) {
        throw new HoodieCommitException("Failed to commit " + commitTime);
      }
      return statuses;
    } catch (Throwable e) {
      if (e instanceof HoodieInsertException) {
        // NOTE(review): unlike upsert(), this rethrow is uncast — a (HoodieInsertException)
        // cast is likely required for this to compile; confirm against version control.
        throw e;
      }
      throw new HoodieInsertException("Failed to insert for commit time " + commitTime, e);
    }
  }

  /**
   * Commit changes performed at the given commitTime marker.
   *
   * Collects per-partition write stats from the executors, writes the commit metadata JSON to a
   * .inflight file and atomically renames it to the final .commit file; on success also archives
   * old commits, runs the cleaner and emits commit metrics.
   *
   * @return true when the rename to the final commit file succeeded
   */
  private boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses) {
    Path commitFile =
        new Path(config.getBasePath() + "/.hoodie/" + FSUtils.makeCommitFileName(commitTime));
    try {

      if (fs.exists(commitFile)) {
        throw new HoodieCommitException("Duplicate commit found. " + commitTime);
      }

      // This collect() is what actually triggers the write job.
      List<Tuple2<String, HoodieWriteStat>> stats =
          writeStatuses.mapToPair(new PairFunction<WriteStatus, String, HoodieWriteStat>() {
            @Override
            public Tuple2<String, HoodieWriteStat> call(WriteStatus writeStatus)
                throws Exception {
              return new Tuple2<>(writeStatus.getPartitionPath(), writeStatus.getStat());
            }
          }).collect();

      HoodieCommitMetadata metadata = new HoodieCommitMetadata();
      for (Tuple2<String, HoodieWriteStat> stat : stats) {
        metadata.addWriteStat(stat._1(), stat._2());
      }

      // open a new file and write the commit metadata in
      Path inflightCommitFile = new Path(config.getBasePath() + "/.hoodie/" + FSUtils
          .makeInflightCommitFileName(commitTime));
      FSDataOutputStream fsout = fs.create(inflightCommitFile, true);
      fsout.writeBytes(new String(metadata.toJsonString().getBytes(StandardCharsets.UTF_8),
          StandardCharsets.UTF_8));
      fsout.close();

      // The rename is the atomic "publish" step: readers only see the commit once it succeeds.
      boolean success = fs.rename(inflightCommitFile, commitFile);
      if (success) {
        // We cannot have unbounded commit files. Archive commits if we have to archive
        archiveLog.archiveIfRequired();
        // Call clean to cleanup if there is anything to cleanup after the commit,
        clean();
        if(writeContext != null) {
          long durationInMs = metrics.getDurationInMs(writeContext.stop());
          metrics.updateCommitMetrics(FORMATTER.parse(commitTime).getTime(), durationInMs,
              metadata);
          writeContext = null;
        }
      }
      return success;
    } catch (IOException e) {
      throw new HoodieCommitException(
          "Failed to commit " + config.getBasePath() + " at time " + commitTime, e);
    } catch (ParseException e) {
      throw new HoodieCommitException(
          "Commit time is not of valid format.Failed to commit " + config.getBasePath()
              + " at time " + commitTime, e);
    }
  }

  /**
   * Rollback the (inflight/committed) record changes with the given commit time.
   * Three steps:
   * (0) Obtain the commit or rollback file
   * (1) clean indexing data,
   * (2) clean new generated parquet files.
   * (3) Finally delete .commit or .inflight file,
   */
  public boolean rollback(final String commitTime) throws HoodieRollbackException {

    final Timer.Context context = metrics.getRollbackCtx();
    final HoodieTableMetadata metadata =
        new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
    final String metaPath = config.getBasePath() + "/" + HoodieTableMetadata.METAFOLDER_NAME;
    try {
      // 0. Obtain the commit/.inflight file, to work on
      FileStatus[] commitFiles =
          fs.globStatus(new Path(metaPath + "/" + commitTime + ".*"));
      if (commitFiles.length != 1) {
        throw new HoodieRollbackException("Expected exactly one .commit or .inflight file for commitTime: " + commitTime);
      }

      // we first need to unpublish the commit by making it .inflight again. (this will ensure no future queries see this data)
      Path filePath = commitFiles[0].getPath();
      if (filePath.getName().endsWith(HoodieTableMetadata.COMMIT_FILE_SUFFIX)) {
        // Only the latest commit may be rolled back; later commits may build on this data.
        if (metadata.findCommitsAfter(commitTime, Integer.MAX_VALUE).size() > 0) {
          throw new HoodieRollbackException("Found commits after time :" + commitTime
              + ", please rollback greater commits first");
        }
        Path newInflightPath = new Path(metaPath + "/" + commitTime + HoodieTableMetadata.INFLIGHT_FILE_SUFFIX);
        if (!fs.rename(filePath, newInflightPath)) {
          throw new HoodieRollbackException("Unable to rename .commit file to .inflight for commitTime:" + commitTime);
        }
        filePath = newInflightPath;
      }

      // 1. Revert the index changes
      logger.info("Clean out index changes at time: " + commitTime);
      if (!index.rollbackCommit(commitTime)) {
        throw new HoodieRollbackException("Clean out index changes failed, for time :" + commitTime);
      }

      // 2. Delete the new generated parquet files
      logger.info("Clean out all parquet files generated at time: " + commitTime);
      final Accumulator<Integer> numFilesDeletedAccu = jsc.accumulator(0);
      jsc.parallelize(FSUtils.getAllPartitionPaths(fs, metadata.getBasePath()))
          .foreach(new VoidFunction<String>() {
            @Override
            public void call(String partitionPath) throws Exception {
              // Scan all partitions files with this commit time
              // (a fresh FileSystem handle is obtained on the executor; the driver's is not serializable)
              FileSystem fs = FSUtils.getFs();
              FileStatus[] toBeDeleted =
                  fs.listStatus(new Path(config.getBasePath(), partitionPath),
                      new PathFilter() {
                        @Override
                        public boolean accept(Path path) {
                          return commitTime
                              .equals(FSUtils.getCommitTime(path.getName()));
                        }
                      });
              for (FileStatus file : toBeDeleted) {
                boolean success = fs.delete(file.getPath(), false);
                logger.info("Delete file " + file.getPath() + "\t" + success);
                if (success) {
                  numFilesDeletedAccu.add(1);
                }
              }
            }
          });

      // 3. Clean out metadata (.commit or .tmp)
      logger.info("Clean out metadata files at time: " + commitTime);
      if (!fs.delete(filePath, false)) {
        logger.error("Deleting file " + filePath + " failed.");
        throw new HoodieRollbackException("Delete file " + filePath + " failed.");
      }

      if (context != null) {
        long durationInMs = metrics.getDurationInMs(context.stop());
        int numFilesDeleted = numFilesDeletedAccu.value();
        metrics.updateRollbackMetrics(durationInMs, numFilesDeleted);
      }

      return true;
    } catch (IOException e) {
      throw new HoodieRollbackException("Failed to rollback "
          + config.getBasePath() + " at commit time" + commitTime, e);
    }
  }

  /**
   * Releases any resources used by the client.
   */
  public void close() {
    // UNDER CONSTRUCTION
  }

  /**
   * Clean up any stale/old files/data lying around (either on file storage or index storage)
   */
  private void clean() throws HoodieIOException {
    try {
      logger.info("Cleaner started");
      final Timer.Context context = metrics.getCleanCtx();
      final HoodieTableMetadata metadata = new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
      List<String> partitionsToClean = FSUtils.getAllPartitionPaths(fs, metadata.getBasePath());
      // shuffle to distribute cleaning work across partitions evenly
      Collections.shuffle(partitionsToClean);
      logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config.getCleanerPolicy());
      if(partitionsToClean.isEmpty()) {
        logger.info("Nothing to clean here mom. It is already clean");
        return;
      }

      int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
      int numFilesDeleted = jsc.parallelize(partitionsToClean, cleanerParallelism)
          .map(new Function<String, Integer>() {
            @Override
            public Integer call(String partitionPathToClean) throws Exception {
              FileSystem fs = FSUtils.getFs();
              HoodieCleaner cleaner = new HoodieCleaner(metadata, config, fs);
              return cleaner.clean(partitionPathToClean);
            }
          }).reduce(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
              return v1 + v2;
            }
          });
      logger.info("Cleaned " + numFilesDeleted + " files");
      // Emit metrics (duration, numFilesDeleted) if needed
      if (context != null) {
        long durationInMs = metrics.getDurationInMs(context.stop());
        // NOTE(review): "Elaspsed" typo in the log message below — left as-is here since log
        // text is runtime behavior; fix separately if nothing scrapes this string.
        logger.info("cleanerElaspsedTime (Minutes): " + durationInMs / (1000 * 60));
        metrics.updateCleanMetrics(durationInMs, numFilesDeleted);
      }
    } catch (IOException e) {
      throw new HoodieIOException("Failed to clean up after commit", e);
    }
  }

  /**
   * Provides a new commit time for a write operation (insert/update)
   */
  public String startCommit() {
    String commitTime = FORMATTER.format(new Date());
    startCommitWithTime(commitTime);
    return commitTime;
  }

  /**
   * Marks the given commit time as started by creating its .inflight marker file.
   *
   * @throws HoodieCommitException if the inflight file cannot be created
   */
  public void startCommitWithTime(String commitTime) {
    logger.info("Generate a new commit time " + commitTime);
    // Create the in-flight commit file
    Path inflightCommitFilePath = new Path(
        config.getBasePath() + "/.hoodie/" + FSUtils.makeInflightCommitFileName(commitTime));
    try {
      if (fs.createNewFile(inflightCommitFilePath)) {
        logger.info("Create an inflight commit file " + inflightCommitFilePath);
        return;
      }
      throw new HoodieCommitException(
          "Failed to create the inflight commit file " + inflightCommitFilePath);
    } catch (IOException e) {
      // handled below
      throw new HoodieCommitException(
          "Failed to create the inflight commit file " + inflightCommitFilePath, e);
    }
  }

  /**
   * Registers hoodie classes with Kryo on the given SparkConf, so RDDs of these types
   * serialize efficiently. Returns the same conf for chaining.
   */
  public static SparkConf registerClasses(SparkConf conf) {
    conf.registerKryoClasses(new Class[]{HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
    return conf;
  }

  /**
   * Deduplicate Hoodie records, using the given deduplication function.
   */
  private JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
    return records.mapToPair(new PairFunction<HoodieRecord<T>, HoodieKey, HoodieRecord<T>>() {
      @Override
      public Tuple2<HoodieKey, HoodieRecord<T>> call(HoodieRecord<T> record) {
        return new Tuple2<>(record.getKey(), record);
      }
    }).reduceByKey(new Function2<HoodieRecord<T>, HoodieRecord<T>, HoodieRecord<T>>() {
      @Override
      public HoodieRecord<T> call(HoodieRecord<T> rec1, HoodieRecord<T> rec2) {
        @SuppressWarnings("unchecked")
        T reducedData = (T) rec1.getData().preCombine(rec2.getData());
        // we cannot allow the user to change the key or partitionPath, since that will affect everything
        // so pick it from one of the records.
        return new HoodieRecord<T>(rec1.getKey(), reducedData);
      }
    }, parallelism).map(new Function<Tuple2<HoodieKey, HoodieRecord<T>>, HoodieRecord<T>>() {
      @Override
      public HoodieRecord<T> call(Tuple2<HoodieKey, HoodieRecord<T>> recordTuple) {
        return recordTuple._2();
      }
    });
  }

  /**
   * Cleanup all inflight commits
   * @throws IOException
   */
  private void rollbackInflightCommits() {
    final HoodieTableMetadata metadata = new HoodieTableMetadata(fs, config.getBasePath(), config.getTableName());
    for (String commit : metadata.getAllInflightCommits()) {
      rollback(commit);
    }
  }
}
diff --git a/hoodie-client/src/main/java/com/uber/hoodie/WriteStatus.java b/hoodie-client/src/main/java/com/uber/hoodie/WriteStatus.java
new file mode 100644
index 000000000..94acbff3c
--- /dev/null
+++ b/hoodie-client/src/main/java/com/uber/hoodie/WriteStatus.java
@@ -0,0 +1,133 @@
/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie; + +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieWriteStat; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +/** + * Status of a write operation. + */ +public class WriteStatus implements Serializable { + + private final HashMap errors = new HashMap<>(); + + private final List writtenRecords = new ArrayList<>(); + + private final List failedRecords = new ArrayList<>(); + + private Throwable globalError = null; + + private String fileId = null; + + private String partitionPath = null; + + private HoodieWriteStat stat = null; + + private long totalRecords = 0; + private long totalErrorRecords = 0; + + public void markSuccess(HoodieRecord record) { + writtenRecords.add(record); + totalRecords++; + } + + public void markFailure(HoodieRecord record, Throwable t) { + failedRecords.add(record); + errors.put(record.getKey(), t); + totalRecords++; + totalErrorRecords++; + } + + public String getFileId() { + return fileId; + } + + public void setFileId(String fileId) { + this.fileId = fileId; + } + + public boolean hasErrors() { + return totalErrorRecords > 0; + } + + public boolean isErrored(HoodieKey key) { + return errors.containsKey(key); + } + + public HashMap getErrors() { + return errors; + } + + public boolean hasGlobalError() { + return globalError != null; + } + + public void setGlobalError(Throwable t) { + this.globalError = t; + } + + public Throwable 
getGlobalError() { + return this.globalError; + } + + public List getWrittenRecords() { + return writtenRecords; + } + + public List getFailedRecords() { + return failedRecords; + } + + public HoodieWriteStat getStat() { + return stat; + } + + public void setStat(HoodieWriteStat stat) { + this.stat = stat; + } + + public String getPartitionPath() { + return partitionPath; + } + + public void setPartitionPath(String partitionPath) { + this.partitionPath = partitionPath; + } + + public long getTotalRecords() { + return totalRecords; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WriteStatus {"); + sb.append("fileId=").append(fileId); + sb.append(", globalError='").append(globalError).append('\''); + sb.append(", hasErrors='").append(hasErrors()).append('\''); + sb.append(", errorCount='").append(totalErrorRecords).append('\''); + sb.append(", errorPct='").append((100.0 * totalErrorRecords) / totalRecords).append('\''); + sb.append('}'); + return sb.toString(); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/DefaultHoodieConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/DefaultHoodieConfig.java new file mode 100644 index 000000000..bf363a38e --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/DefaultHoodieConfig.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.config; + +import java.io.Serializable; +import java.util.Map; +import java.util.Properties; + +/** + * Default Way to load Hoodie config through a java.util.Properties + */ +public class DefaultHoodieConfig implements Serializable { + protected final Properties props; + public DefaultHoodieConfig(Properties props) { + this.props = props; + } + + public Properties getProps() { + return props; + } + + public static void setDefaultOnCondition(Properties props, boolean condition, String propName, + String defaultValue) { + if (condition) { + props.setProperty(propName, defaultValue); + } + } + + public static void setDefaultOnCondition(Properties props, boolean condition, DefaultHoodieConfig config) { + if (condition) { + props.putAll(config.getProps()); + } + } + +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieCompactionConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieCompactionConfig.java new file mode 100644 index 000000000..b6be693b8 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieCompactionConfig.java @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.config; + +import com.google.common.base.Preconditions; +import com.uber.hoodie.io.HoodieCleaner; + +import javax.annotation.concurrent.Immutable; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Compaction related config + */ +@Immutable +public class HoodieCompactionConfig extends DefaultHoodieConfig { + public static final String CLEANER_POLICY_PROP = "hoodie.cleaner.policy"; + private static final String DEFAULT_CLEANER_POLICY = + HoodieCleaner.CleaningPolicy.KEEP_LATEST_COMMITS.name(); + + public static final String CLEANER_FILE_VERSIONS_RETAINED_PROP = + "hoodie.cleaner.fileversions.retained"; + private static final String DEFAULT_CLEANER_FILE_VERSIONS_RETAINED = "3"; + + public static final String CLEANER_COMMITS_RETAINED_PROP = "hoodie.cleaner.commits.retained"; + private static final String DEFAULT_CLEANER_COMMITS_RETAINED = "24"; + + public static final String MAX_COMMITS_TO_KEEP = "hoodie.keep.max.commits"; + private static final String DEFAULT_MAX_COMMITS_TO_KEEP = String.valueOf(128); + public static final String MIN_COMMITS_TO_KEEP = "hoodie.keep.min.commits"; + private static final String DEFAULT_MIN_COMMITS_TO_KEEP = String.valueOf(96); + // Upsert uses this file size to compact new data onto existing files.. + public static final String PARQUET_SMALL_FILE_LIMIT_BYTES = "hoodie.parquet.small.file.limit"; + // Turned off by default + public static final String DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES = String.valueOf(0); + + + /** Configs related to specific table types **/ + // Number of inserts, that will be put each partition/bucket for writing + public static final String COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = "hoodie.copyonwrite.insert.split.size"; + // The rationale to pick the insert parallelism is the following. Writing out 100MB files, + // with atleast 1kb records, means 100K records per file. 
we just overprovision to 500K + public static final String DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = String.valueOf(500000); + + // Config to control whether we control insert split sizes automatically based on average record sizes + public static final String COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = "hoodie.copyonwrite.insert.auto.split"; + // its off by default + public static final String DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = String.valueOf(false); + + + // This value is used as a guessimate for the record size, if we can't determine this from previous commits + public static final String COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = "hoodie.copyonwrite.record.size.estimate"; + // Used to determine how much more can be packed into a small file, before it exceeds the size limit. + public static final String DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = String.valueOf(1024); + + public static final String CLEANER_PARALLELISM = "hoodie.cleaner.parallelism"; + public static final String DEFAULT_CLEANER_PARALLELISM = String.valueOf(200); + + + private HoodieCompactionConfig(Properties props) { + super(props); + } + + public static HoodieCompactionConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } + } + + public Builder withCleanerPolicy(HoodieCleaner.CleaningPolicy policy) { + props.setProperty(CLEANER_POLICY_PROP, policy.name()); + return this; + } + + public Builder retainFileVersions(int fileVersionsRetained) { + props.setProperty(CLEANER_FILE_VERSIONS_RETAINED_PROP, + String.valueOf(fileVersionsRetained)); + return this; + } + + public Builder retainCommits(int commitsRetained) { + props.setProperty(CLEANER_COMMITS_RETAINED_PROP, 
String.valueOf(commitsRetained)); + return this; + } + + public Builder archiveCommitsWith(int minToKeep, int maxToKeep) { + props.setProperty(MIN_COMMITS_TO_KEEP, String.valueOf(minToKeep)); + props.setProperty(MAX_COMMITS_TO_KEEP, String.valueOf(maxToKeep)); + return this; + } + + public Builder compactionSmallFileSize(long smallFileLimitBytes) { + props.setProperty(PARQUET_SMALL_FILE_LIMIT_BYTES, String.valueOf(smallFileLimitBytes)); + return this; + } + + public Builder insertSplitSize(int insertSplitSize) { + props.setProperty(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, String.valueOf(insertSplitSize)); + return this; + } + + public Builder autoTuneInsertSplits(boolean autoTuneInsertSplits) { + props.setProperty(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, String.valueOf(autoTuneInsertSplits)); + return this; + } + + public Builder approxRecordSize(int recordSizeEstimate) { + props.setProperty(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, String.valueOf(recordSizeEstimate)); + return this; + } + + public Builder withCleanerParallelism(int cleanerParallelism) { + props.setProperty(CLEANER_PARALLELISM, String.valueOf(cleanerParallelism)); + return this; + } + + public HoodieCompactionConfig build() { + HoodieCompactionConfig config = new HoodieCompactionConfig(props); + setDefaultOnCondition(props, !props.containsKey(CLEANER_POLICY_PROP), + CLEANER_POLICY_PROP, DEFAULT_CLEANER_POLICY); + setDefaultOnCondition(props, !props.containsKey(CLEANER_FILE_VERSIONS_RETAINED_PROP), + CLEANER_FILE_VERSIONS_RETAINED_PROP, DEFAULT_CLEANER_FILE_VERSIONS_RETAINED); + setDefaultOnCondition(props, !props.containsKey(CLEANER_COMMITS_RETAINED_PROP), + CLEANER_COMMITS_RETAINED_PROP, DEFAULT_CLEANER_COMMITS_RETAINED); + setDefaultOnCondition(props, !props.containsKey(MAX_COMMITS_TO_KEEP), + MAX_COMMITS_TO_KEEP, DEFAULT_MAX_COMMITS_TO_KEEP); + setDefaultOnCondition(props, !props.containsKey(MIN_COMMITS_TO_KEEP), + MIN_COMMITS_TO_KEEP, DEFAULT_MIN_COMMITS_TO_KEEP); + setDefaultOnCondition(props, 
!props.containsKey(PARQUET_SMALL_FILE_LIMIT_BYTES), + PARQUET_SMALL_FILE_LIMIT_BYTES, DEFAULT_PARQUET_SMALL_FILE_LIMIT_BYTES); + setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE), + COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE, DEFAULT_COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE); + setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS), + COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS, DEFAULT_COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS); + setDefaultOnCondition(props, !props.containsKey(COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE), + COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE, DEFAULT_COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE); + setDefaultOnCondition(props, !props.containsKey(CLEANER_PARALLELISM), + CLEANER_PARALLELISM, DEFAULT_CLEANER_PARALLELISM); + + HoodieCleaner.CleaningPolicy.valueOf(props.getProperty(CLEANER_POLICY_PROP)); + Preconditions.checkArgument( + Integer.parseInt(props.getProperty(MAX_COMMITS_TO_KEEP)) > Integer + .parseInt(props.getProperty(MIN_COMMITS_TO_KEEP))); + return config; + } + + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java new file mode 100644 index 000000000..40f4f4521 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieIndexConfig.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.config; + +import com.google.common.base.Preconditions; +import com.uber.hoodie.index.HoodieIndex; + +import javax.annotation.concurrent.Immutable; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Indexing related config + */ +@Immutable +public class HoodieIndexConfig extends DefaultHoodieConfig { + public static final String INDEX_TYPE_PROP = "hoodie.index.type"; + public static final String DEFAULT_INDEX_TYPE = HoodieIndex.IndexType.BLOOM.name(); + public static final String BLOOM_FILTER_NUM_ENTRIES = "hoodie.index.bloom.num_entries"; + public static final String DEFAULT_BLOOM_FILTER_NUM_ENTRIES = "60000"; + public static final String BLOOM_FILTER_FPP = "hoodie.index.bloom.fpp"; + public static final String DEFAULT_BLOOM_FILTER_FPP = "0.000000001"; + public final static String HBASE_ZKQUORUM_PROP = "hoodie.index.hbase.zkquorum"; + public final static String HBASE_ZKPORT_PROP = "hoodie.index.hbase.zkport"; + public final static String HBASE_TABLENAME_PROP = "hoodie.index.hbase.table"; + + private HoodieIndexConfig(Properties props) { + super(props); + } + + public static HoodieIndexConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } + } + + public Builder withIndexType(HoodieIndex.IndexType indexType) { + props.setProperty(INDEX_TYPE_PROP, indexType.name()); + return this; + } + + public Builder bloomFilterNumEntries(int numEntries) { + props.setProperty(BLOOM_FILTER_NUM_ENTRIES, String.valueOf(numEntries)); + return this; + } + + public Builder 
bloomFilterFPP(double fpp) { + props.setProperty(BLOOM_FILTER_FPP, String.valueOf(fpp)); + return this; + } + + public Builder hbaseZkQuorum(String zkString) { + props.setProperty(HBASE_ZKQUORUM_PROP, zkString); + return this; + } + + public Builder hbaseZkPort(int port) { + props.setProperty(HBASE_ZKPORT_PROP, String.valueOf(port)); + return this; + } + + public Builder hbaseTableName(String tableName) { + props.setProperty(HBASE_TABLENAME_PROP, tableName); + return this; + } + + public HoodieIndexConfig build() { + HoodieIndexConfig config = new HoodieIndexConfig(props); + setDefaultOnCondition(props, !props.containsKey(INDEX_TYPE_PROP), + INDEX_TYPE_PROP, DEFAULT_INDEX_TYPE); + setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_NUM_ENTRIES), + BLOOM_FILTER_NUM_ENTRIES, DEFAULT_BLOOM_FILTER_NUM_ENTRIES); + setDefaultOnCondition(props, !props.containsKey(BLOOM_FILTER_FPP), + BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_FPP); + // Throws IllegalArgumentException if the value set is not a known Hoodie Index Type + HoodieIndex.IndexType.valueOf(props.getProperty(INDEX_TYPE_PROP)); + return config; + } + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieMetricsConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieMetricsConfig.java new file mode 100644 index 000000000..48dcdfe3c --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieMetricsConfig.java @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.config; + +import com.uber.hoodie.metrics.MetricsReporterType; + +import javax.annotation.concurrent.Immutable; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Fetch the configurations used by the Metrics system. + */ +@Immutable +public class HoodieMetricsConfig extends DefaultHoodieConfig { + + public final static String METRIC_PREFIX = "hoodie.metrics"; + public final static String METRICS_ON = METRIC_PREFIX + ".on"; + public final static boolean DEFAULT_METRICS_ON = false; + public final static String METRICS_REPORTER_TYPE = METRIC_PREFIX + ".reporter.type"; + public final static MetricsReporterType DEFAULT_METRICS_REPORTER_TYPE = + MetricsReporterType.GRAPHITE; + + // Graphite + public final static String GRAPHITE_PREFIX = METRIC_PREFIX + ".graphite"; + public final static String GRAPHITE_SERVER_HOST = GRAPHITE_PREFIX + ".host"; + public final static String DEFAULT_GRAPHITE_SERVER_HOST = "localhost"; + + public final static String GRAPHITE_SERVER_PORT = GRAPHITE_PREFIX + ".port"; + public final static int DEFAULT_GRAPHITE_SERVER_PORT = 4756; + + public final static String GRAPHITE_METRIC_PREFIX = GRAPHITE_PREFIX + ".metric.prefix"; + + private HoodieMetricsConfig(Properties props) { + super(props); + } + + public static HoodieMetricsConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } + } + + public Builder on(boolean metricsOn) { + props.setProperty(METRICS_ON, String.valueOf(metricsOn)); + return this; + } + + public Builder withReporterType(String 
reporterType) { + props.setProperty(METRICS_REPORTER_TYPE, reporterType); + return this; + } + + public Builder toGraphiteHost(String host) { + props.setProperty(GRAPHITE_SERVER_HOST, host); + return this; + } + + public Builder onGraphitePort(int port) { + props.setProperty(GRAPHITE_SERVER_PORT, String.valueOf(port)); + return this; + } + + public Builder usePrefix(String prefix) { + props.setProperty(GRAPHITE_METRIC_PREFIX, prefix); + return this; + } + + public HoodieMetricsConfig build() { + HoodieMetricsConfig config = new HoodieMetricsConfig(props); + setDefaultOnCondition(props, !props.containsKey(METRICS_ON), METRICS_ON, + String.valueOf(DEFAULT_METRICS_ON)); + setDefaultOnCondition(props, !props.containsKey(METRICS_REPORTER_TYPE), + METRICS_REPORTER_TYPE, DEFAULT_METRICS_REPORTER_TYPE.name()); + setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_HOST), + GRAPHITE_SERVER_HOST, DEFAULT_GRAPHITE_SERVER_HOST); + setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), + GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); + setDefaultOnCondition(props, !props.containsKey(GRAPHITE_SERVER_PORT), + GRAPHITE_SERVER_PORT, String.valueOf(DEFAULT_GRAPHITE_SERVER_PORT)); + return config; + } + } + +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieStorageConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieStorageConfig.java new file mode 100644 index 000000000..50ed232cf --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieStorageConfig.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.config; + +import javax.annotation.concurrent.Immutable; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Storage related config + */ +@Immutable +public class HoodieStorageConfig extends DefaultHoodieConfig { + public static final String PARQUET_FILE_MAX_BYTES = "hoodie.parquet.max.file.size"; + public static final String DEFAULT_PARQUET_FILE_MAX_BYTES = String.valueOf(120 * 1024 * 1024); + public static final String PARQUET_BLOCK_SIZE_BYTES = "hoodie.parquet.block.size"; + public static final String DEFAULT_PARQUET_BLOCK_SIZE_BYTES = DEFAULT_PARQUET_FILE_MAX_BYTES; + public static final String PARQUET_PAGE_SIZE_BYTES = "hoodie.parquet.page.size"; + public static final String DEFAULT_PARQUET_PAGE_SIZE_BYTES = String.valueOf(1 * 1024 * 1024); + + private HoodieStorageConfig(Properties props) { + super(props); + } + + public static HoodieStorageConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private final Properties props = new Properties(); + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } + } + + public Builder limitFileSize(int maxFileSize) { + props.setProperty(PARQUET_FILE_MAX_BYTES, String.valueOf(maxFileSize)); + return this; + } + + public Builder parquetBlockSize(int blockSize) { + props.setProperty(PARQUET_BLOCK_SIZE_BYTES, 
String.valueOf(blockSize)); + return this; + } + + public Builder parquetPageSize(int pageSize) { + props.setProperty(PARQUET_PAGE_SIZE_BYTES, String.valueOf(pageSize)); + return this; + } + + public HoodieStorageConfig build() { + HoodieStorageConfig config = new HoodieStorageConfig(props); + setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES), + PARQUET_FILE_MAX_BYTES, DEFAULT_PARQUET_FILE_MAX_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_BLOCK_SIZE_BYTES), + PARQUET_BLOCK_SIZE_BYTES, DEFAULT_PARQUET_BLOCK_SIZE_BYTES); + setDefaultOnCondition(props, !props.containsKey(PARQUET_PAGE_SIZE_BYTES), + PARQUET_PAGE_SIZE_BYTES, DEFAULT_PARQUET_PAGE_SIZE_BYTES); + return config; + } + } + +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java new file mode 100644 index 000000000..c050294fe --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/config/HoodieWriteConfig.java @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.config; + + +import com.google.common.base.Preconditions; +import com.uber.hoodie.index.HoodieIndex; +import com.uber.hoodie.io.HoodieCleaner; +import com.uber.hoodie.metrics.MetricsReporterType; +import org.apache.spark.storage.StorageLevel; + +import javax.annotation.concurrent.Immutable; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Properties; + +/** + * Class storing configs for the {@link com.uber.hoodie.HoodieWriteClient} + */ +@Immutable +public class HoodieWriteConfig extends DefaultHoodieConfig { + private static final String BASE_PATH_PROP = "hoodie.base.path"; + private static final String AVRO_SCHEMA = "hoodie.avro.schema"; + private static final String TABLE_NAME = "hoodie.table.name"; + private static final String DEFAULT_PARALLELISM = "200"; + private static final String INSERT_PARALLELISM = "hoodie.insert.shuffle.parallelism"; + private static final String UPSERT_PARALLELISM = "hoodie.upsert.shuffle.parallelism"; + private static final String COMBINE_BEFORE_INSERT_PROP = "hoodie.combine.before.insert"; + private static final String DEFAULT_COMBINE_BEFORE_INSERT = "false"; + private static final String COMBINE_BEFORE_UPSERT_PROP = "hoodie.combine.before.upsert"; + private static final String DEFAULT_COMBINE_BEFORE_UPSERT = "true"; + private static final String WRITE_STATUS_STORAGE_LEVEL = "hoodie.write.status.storage.level"; + private static final String DEFAULT_WRITE_STATUS_STORAGE_LEVEL = "MEMORY_AND_DISK_SER"; + + private HoodieWriteConfig(Properties props) { + super(props); + } + + /** + * base properties + **/ + public String getBasePath() { + return props.getProperty(BASE_PATH_PROP); + } + + public String getSchema() { + return props.getProperty(AVRO_SCHEMA); + } + + public String getTableName() { + return props.getProperty(TABLE_NAME); + } + + public int getInsertShuffleParallelism() { + return Integer.parseInt(props.getProperty(INSERT_PARALLELISM)); + } + + 
public int getUpsertShuffleParallelism() { + return Integer.parseInt(props.getProperty(UPSERT_PARALLELISM)); + } + + public boolean shouldCombineBeforeInsert() { + return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_INSERT_PROP)); + } + + public boolean shouldCombineBeforeUpsert() { + return Boolean.parseBoolean(props.getProperty(COMBINE_BEFORE_UPSERT_PROP)); + } + + public StorageLevel getWriteStatusStorageLevel() { + return StorageLevel.fromString(props.getProperty(WRITE_STATUS_STORAGE_LEVEL)); + } + + /** + * compaction properties + **/ + public HoodieCleaner.CleaningPolicy getCleanerPolicy() { + return HoodieCleaner.CleaningPolicy + .valueOf(props.getProperty(HoodieCompactionConfig.CLEANER_POLICY_PROP)); + } + + public int getCleanerFileVersionsRetained() { + return Integer.parseInt( + props.getProperty(HoodieCompactionConfig.CLEANER_FILE_VERSIONS_RETAINED_PROP)); + } + + public int getCleanerCommitsRetained() { + return Integer + .parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED_PROP)); + } + + public int getMaxCommitsToKeep() { + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP)); + } + + public int getMinCommitsToKeep() { + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP)); + } + + public int getParquetSmallFileLimit() { + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT_BYTES)); + } + + public int getCopyOnWriteInsertSplitSize() { + return Integer.parseInt( + props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE)); + } + + public int getCopyOnWriteRecordSizeEstimate() { + return Integer.parseInt( + props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE)); + } + + public boolean shouldAutoTuneInsertSplits() { + return Boolean.parseBoolean( + props.getProperty(HoodieCompactionConfig.COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS)); + } + + public int 
getCleanerParallelism() { + return Integer.parseInt(props.getProperty(HoodieCompactionConfig.CLEANER_PARALLELISM)); + } + + /** + * index properties + **/ + public HoodieIndex.IndexType getIndexType() { + return HoodieIndex.IndexType.valueOf(props.getProperty(HoodieIndexConfig.INDEX_TYPE_PROP)); + } + + public int getBloomFilterNumEntries() { + return Integer.parseInt(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES)); + } + + public double getBloomFilterFPP() { + return Double.parseDouble(props.getProperty(HoodieIndexConfig.BLOOM_FILTER_FPP)); + } + + public String getHbaseZkQuorum() { + return props.getProperty(HoodieIndexConfig.HBASE_ZKQUORUM_PROP); + } + + public int getHbaseZkPort() { + return Integer.parseInt(props.getProperty(HoodieIndexConfig.HBASE_ZKPORT_PROP)); + } + + public String getHbaseTableName() { + return props.getProperty(HoodieIndexConfig.HBASE_TABLENAME_PROP); + } + + /** + * storage properties + **/ + public int getParquetMaxFileSize() { + return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_FILE_MAX_BYTES)); + } + + public int getParquetBlockSize() { + return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_BLOCK_SIZE_BYTES)); + } + + public int getParquetPageSize() { + return Integer.parseInt(props.getProperty(HoodieStorageConfig.PARQUET_PAGE_SIZE_BYTES)); + } + + /** + * metrics properties + **/ + public boolean isMetricsOn() { + return Boolean.parseBoolean(props.getProperty(HoodieMetricsConfig.METRICS_ON)); + } + + public MetricsReporterType getMetricsReporterType() { + return MetricsReporterType + .valueOf(props.getProperty(HoodieMetricsConfig.METRICS_REPORTER_TYPE)); + } + + public String getGraphiteServerHost() { + return props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_HOST); + } + + public int getGraphiteServerPort() { + return Integer.parseInt(props.getProperty(HoodieMetricsConfig.GRAPHITE_SERVER_PORT)); + } + + public String getGraphiteMetricPrefix() { + return 
props.getProperty(HoodieMetricsConfig.GRAPHITE_METRIC_PREFIX); + } + + public static HoodieWriteConfig.Builder newBuilder() { + return new Builder(); + } + + public static class Builder { + private final Properties props = new Properties(); + private boolean isIndexConfigSet = false; + private boolean isStorageConfigSet = false; + private boolean isCompactionConfigSet = false; + private boolean isMetricsConfigSet = false; + + public Builder fromFile(File propertiesFile) throws IOException { + FileReader reader = new FileReader(propertiesFile); + try { + this.props.load(reader); + return this; + } finally { + reader.close(); + } + } + + + public Builder withPath(String basePath) { + props.setProperty(BASE_PATH_PROP, basePath); + return this; + } + + public Builder withSchema(String schemaStr) { + props.setProperty(AVRO_SCHEMA, schemaStr); + return this; + } + + public Builder forTable(String tableName) { + props.setProperty(TABLE_NAME, tableName); + return this; + } + + public Builder withParallelism(int insertShuffleParallelism, int upsertShuffleParallelism) { + props.setProperty(INSERT_PARALLELISM, String.valueOf(insertShuffleParallelism)); + props.setProperty(UPSERT_PARALLELISM, String.valueOf(upsertShuffleParallelism)); + return this; + } + + public Builder combineInput(boolean onInsert, boolean onUpsert) { + props.setProperty(COMBINE_BEFORE_INSERT_PROP, String.valueOf(onInsert)); + props.setProperty(COMBINE_BEFORE_UPSERT_PROP, String.valueOf(onUpsert)); + return this; + } + + public Builder withWriteStatusStorageLevel(StorageLevel level) { + props.setProperty(WRITE_STATUS_STORAGE_LEVEL, level.toString()); + return this; + } + + public Builder withIndexConfig(HoodieIndexConfig indexConfig) { + props.putAll(indexConfig.getProps()); + isIndexConfigSet = true; + return this; + } + + public Builder withStorageConfig(HoodieStorageConfig storageConfig) { + props.putAll(storageConfig.getProps()); + isStorageConfigSet = true; + return this; + } + + public Builder 
withCompactionConfig(HoodieCompactionConfig compactionConfig) { + props.putAll(compactionConfig.getProps()); + isCompactionConfigSet = true; + return this; + } + + public Builder withMetricsConfig(HoodieMetricsConfig metricsConfig) { + props.putAll(metricsConfig.getProps()); + isMetricsConfigSet = true; + return this; + } + + public HoodieWriteConfig build() { + HoodieWriteConfig config = new HoodieWriteConfig(props); + // Check for mandatory properties + Preconditions.checkArgument(config.getBasePath() != null); + setDefaultOnCondition(props, !props.containsKey(INSERT_PARALLELISM), INSERT_PARALLELISM, + DEFAULT_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(UPSERT_PARALLELISM), UPSERT_PARALLELISM, + DEFAULT_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), + COMBINE_BEFORE_INSERT_PROP, DEFAULT_COMBINE_BEFORE_INSERT); + setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), + COMBINE_BEFORE_UPSERT_PROP, DEFAULT_COMBINE_BEFORE_UPSERT); + setDefaultOnCondition(props, !props.containsKey(WRITE_STATUS_STORAGE_LEVEL), + WRITE_STATUS_STORAGE_LEVEL, DEFAULT_WRITE_STATUS_STORAGE_LEVEL); + + + setDefaultOnCondition(props, !isIndexConfigSet, HoodieIndexConfig.newBuilder().build()); + setDefaultOnCondition(props, !isStorageConfigSet, + HoodieStorageConfig.newBuilder().build()); + setDefaultOnCondition(props, !isCompactionConfigSet, + HoodieCompactionConfig.newBuilder().build()); + setDefaultOnCondition(props, !isMetricsConfigSet, + HoodieMetricsConfig.newBuilder().build()); + return config; + } + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCommitException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCommitException.java new file mode 100644 index 000000000..bc4c139f5 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieCommitException.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. 
(hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.exception; + +/** + *

+ * Exception thrown for any higher level errors when HoodieClient is doing a Commit + *

+ */ +public class HoodieCommitException extends HoodieException { + public HoodieCommitException(String msg) { + super(msg); + } + + public HoodieCommitException(String msg, Throwable e) { + super(msg, e); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieDependentSystemUnavailableException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieDependentSystemUnavailableException.java new file mode 100644 index 000000000..4f64d76ca --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieDependentSystemUnavailableException.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.exception; + + +/** + *

+ * Exception thrown when dependent system is not available + *

+ */ +public class HoodieDependentSystemUnavailableException extends HoodieException { + public static final String HBASE = "HBASE"; + + public HoodieDependentSystemUnavailableException(String system, String connectURL) { + super(getLogMessage(system, connectURL)); + } + + private static String getLogMessage(String system, String connectURL) { + return "System " + system + " unavailable. Tried to connect to " + connectURL; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieInsertException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieInsertException.java new file mode 100644 index 000000000..a228541d3 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieInsertException.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.exception; + +import java.io.IOException; + +/** + *

+ * Exception thrown for any higher level errors when HoodieClient is doing a bulk insert + *

+ */ +public class HoodieInsertException extends HoodieException { + public HoodieInsertException(String msg, Throwable e) { + super(msg, e); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieRollbackException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieRollbackException.java new file mode 100644 index 000000000..67e4835a6 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieRollbackException.java @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.exception; + +public class HoodieRollbackException extends HoodieException { + + public HoodieRollbackException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieRollbackException(String msg) { + super(msg); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieUpsertException.java b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieUpsertException.java new file mode 100644 index 000000000..16779a92b --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/exception/HoodieUpsertException.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.exception; + +/** + *

+ * Exception thrown for any higher level errors when HoodieClient is doing a incremental upsert + *

+ */ +public class HoodieUpsertException extends HoodieException { + public HoodieUpsertException(String msg, Throwable e) { + super(msg, e); + } + + public HoodieUpsertException(String msg) { + super(msg); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/InsertMapFunction.java b/hoodie-client/src/main/java/com/uber/hoodie/func/InsertMapFunction.java new file mode 100644 index 000000000..98703221e --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/InsertMapFunction.java @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.func; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import org.apache.spark.api.java.function.Function2; + +import java.util.Iterator; +import java.util.List; + + +/** + * Map function that handles a sorted stream of HoodieRecords + */ +public class InsertMapFunction + implements Function2>, Iterator>> { + + private String commitTime; + private HoodieWriteConfig config; + private HoodieTableMetadata metadata; + + public InsertMapFunction(String commitTime, HoodieWriteConfig config, + HoodieTableMetadata metadata) { + this.commitTime = commitTime; + this.config = config; + this.metadata = metadata; + } + + @Override + public Iterator> call(Integer partition, Iterator> sortedRecordItr) + throws Exception { + return new LazyInsertIterable<>(sortedRecordItr, config, commitTime, metadata); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java new file mode 100644 index 000000000..579191651 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.func; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; + +import com.uber.hoodie.io.HoodieIOHandle; +import com.uber.hoodie.io.HoodieInsertHandle; +import org.apache.spark.TaskContext; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +/** + * Lazy Iterable, that writes a stream of HoodieRecords sorted by the partitionPath, + * into new files. + */ +public class LazyInsertIterable extends LazyIterableIterator, List> { + + private final HoodieWriteConfig hoodieConfig; + private final String commitTime; + private final HoodieTableMetadata tableMetadata; + private Set partitionsCleaned; + private HoodieInsertHandle handle; + + public LazyInsertIterable(Iterator> sortedRecordItr, HoodieWriteConfig config, + String commitTime, HoodieTableMetadata metadata) { + super(sortedRecordItr); + this.partitionsCleaned = new HashSet<>(); + this.hoodieConfig = config; + this.commitTime = commitTime; + this.tableMetadata = metadata; + } + + @Override protected void start() { + } + + + @Override protected List computeNext() { + List statuses = new ArrayList<>(); + + while (inputItr.hasNext()) { + HoodieRecord record = inputItr.next(); + + // clean up any partial failures + if (!partitionsCleaned.contains(record.getPartitionPath())) { + // This insert task could fail multiple times, but Spark will faithfully retry with + // the same data again. 
Thus, before we open any files under a given partition, we + // first delete any files in the same partitionPath written by same Spark partition + HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig, + commitTime, + record.getPartitionPath(), + TaskContext.getPartitionId()); + partitionsCleaned.add(record.getPartitionPath()); + } + + // lazily initialize the handle, for the first time + if (handle == null) { + handle = + new HoodieInsertHandle(hoodieConfig, commitTime, tableMetadata, + record.getPartitionPath()); + } + + if (handle.canWrite(record)) { + // write the record, if the handle has capacity + handle.write(record); + } else { + // handle is full. + statuses.add(handle.close()); + // Need to handle the rejected record & open new handle + handle = + new HoodieInsertHandle(hoodieConfig, commitTime, tableMetadata, + record.getPartitionPath()); + handle.write(record); // we should be able to write 1 record. + break; + } + } + + // If we exited out, because we ran out of records, just close the pending handle. + if (!inputItr.hasNext()) { + if (handle != null) { + statuses.add(handle.close()); + } + } + + assert statuses.size() > 0; // should never return empty statuses + return statuses; + } + + @Override protected void end() { + + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyIterableIterator.java b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyIterableIterator.java new file mode 100644 index 000000000..195342f82 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyIterableIterator.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.func; + +import java.util.Iterator; + +/** + * (NOTE: Adapted from Apache SystemML) This class is a generic base class for lazy, single pass + * inputItr classes in order to simplify the implementation of lazy iterators for mapPartitions use + * cases. Note [SPARK-3369], which gives the reasons for backwards compatibility with regard to the + * iterable API despite Spark's single pass nature. + * + * Provide a way to obtain a inputItr of type O (output), out of an inputItr of type I (input) + * + * Things to remember: - Assumes Spark calls hasNext() to check for elements, before calling next() + * to obtain them - Assumes hasNext() gets called atleast once. - Concrete Implementation is + * responsible for calling inputIterator.next() and doing the processing in computeNext() + */ +public abstract class LazyIterableIterator implements Iterable, Iterator { + protected Iterator inputItr = null; + private boolean consumed = false; + private boolean startCalled = false; + private boolean endCalled = false; + + public LazyIterableIterator(Iterator in) { + inputItr = in; + } + + /** + * Called once, before any elements are processed + */ + protected abstract void start(); + + /** + * Block computation to be overwritten by sub classes. + */ + protected abstract O computeNext(); + + + /** + * Called once, after all elements are processed. 
+   */
+  protected abstract void end();
+
+
+  //////////////////
+  // iterable implementation
+
+  // Runs start() exactly once, before the first element is handed out.
+  private void invokeStartIfNeeded() {
+    if (!startCalled) {
+      startCalled = true;
+      try {
+        start();
+      } catch (Exception e) {
+        // Chain the cause so the real failure from start() is not lost.
+        throw new RuntimeException("Error in start()", e);
+      }
+    }
+  }
+
+  // Runs end() exactly once, after the last element has been consumed.
+  private void invokeEndIfNeeded() {
+    // make the calls out to begin() & end()
+    if (!endCalled) {
+      endCalled = true;
+      // if we are out of elements, and end has not been called yet
+      try {
+        end();
+      } catch (Exception e) {
+        // Chain the cause so the real failure from end() is not lost.
+        throw new RuntimeException("Error in end()", e);
+      }
+    }
+  }
+
+  @Override
+  public Iterator<O> iterator() {
+    //check for consumed inputItr
+    if (consumed)
+      throw new RuntimeException("Invalid repeated inputItr consumption.");
+
+    //hand out self as inputItr exactly once (note: do not hand out the input
+    //inputItr since it is consumed by the self inputItr implementation)
+    consumed = true;
+    return this;
+  }
+
+  //////////////////
+  // inputItr implementation
+
+  @Override
+  public boolean hasNext() {
+    boolean ret = inputItr.hasNext();
+    // make sure, there is exactly one call to start()
+    invokeStartIfNeeded();
+    if (!ret) {
+      // if we are out of elements, and end has not been called yet
+      invokeEndIfNeeded();
+    }
+
+    return ret;
+  }
+
+  @Override
+  public O next() {
+    try {
+      return computeNext();
+    } catch (Exception ex) {
+      throw new RuntimeException(ex);
+    }
+  }
+
+  @Override
+  public void remove() {
+    throw new RuntimeException("Unsupported remove operation.");
+  }
+}
diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java
new file mode 100644
index 000000000..98e889b65
--- /dev/null
+++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HBaseIndex.java
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2016 Uber Technologies, Inc.
(hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.index; + +import com.google.common.base.Optional; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieRecord; + +import com.uber.hoodie.config.HoodieIndexConfig; +import com.uber.hoodie.exception.HoodieDependentSystemUnavailableException; +import com.uber.hoodie.exception.HoodieIndexException; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.*; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function2; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Hoodie Index implementation backed by HBase + */ +public class HBaseIndex extends HoodieIndex { + private final static byte[] SYSTEM_COLUMN_FAMILY = 
Bytes.toBytes("_s"); + private final static byte[] COMMIT_TS_COLUMN = Bytes.toBytes("commit_ts"); + private final static byte[] FILE_NAME_COLUMN = Bytes.toBytes("file_name"); + private final static byte[] PARTITION_PATH_COLUMN = Bytes.toBytes("partition_path"); + + private static Logger logger = LogManager.getLogger(HBaseIndex.class); + + private final String tableName; + + public HBaseIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + super(config, jsc); + this.tableName = config.getProps().getProperty(HoodieIndexConfig.HBASE_TABLENAME_PROP); + } + + @Override + public JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, HoodieTableMetadata metadata) { + throw new UnsupportedOperationException("HBase index does not implement check exist yet"); + } + + private static Connection hbaseConnection = null; + + private Connection getHBaseConnection() { + Configuration hbaseConfig = HBaseConfiguration.create(); + String quorum = config.getProps().getProperty(HoodieIndexConfig.HBASE_ZKQUORUM_PROP); + hbaseConfig.set("hbase.zookeeper.quorum", quorum); + String port = config.getProps().getProperty(HoodieIndexConfig.HBASE_ZKPORT_PROP); + hbaseConfig.set("hbase.zookeeper.property.clientPort", port); + try { + return ConnectionFactory.createConnection(hbaseConfig); + } catch (IOException e) { + throw new HoodieDependentSystemUnavailableException( + HoodieDependentSystemUnavailableException.HBASE, quorum + ":" + port); + } + } + + /** + * Function that tags each HoodieRecord with an existing location, if known. 
+ */ + class LocationTagFunction + implements Function2>, Iterator>> { + + private final HoodieTableMetadata metadata; + + LocationTagFunction(HoodieTableMetadata metadata) { + this.metadata = metadata; + } + + @Override + public Iterator> call(Integer partitionNum, + Iterator> hoodieRecordIterator) { + // Grab the global HBase connection + synchronized (HBaseIndex.class) { + if (hbaseConnection == null) { + hbaseConnection = getHBaseConnection(); + } + } + List> taggedRecords = new ArrayList<>(); + HTable hTable = null; + try { + hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); + // Do the tagging. + while (hoodieRecordIterator.hasNext()) { + HoodieRecord rec = hoodieRecordIterator.next(); + // TODO(vc): This may need to be a multi get. + Result result = hTable.get( + new Get(Bytes.toBytes(rec.getRecordKey())).setMaxVersions(1) + .addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN) + .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN) + .addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN)); + + // first, attempt to grab location from HBase + if (result.getRow() != null) { + String commitTs = + Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)); + String fileId = + Bytes.toString(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN)); + + // if the last commit ts for this row is less than the system commit ts + if (!metadata.isCommitsEmpty() && metadata.isCommitTsSafe(commitTs)) { + rec.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId)); + } + } + taggedRecords.add(rec); + } + } catch (IOException e) { + throw new HoodieIndexException( + "Failed to Tag indexed locations because of exception with HBase Client", e); + } + + finally { + if (hTable != null) { + try { + hTable.close(); + } catch (IOException e) { + // Ignore + } + } + + } + return taggedRecords.iterator(); + } + } + + @Override + public JavaRDD> tagLocation(JavaRDD> recordRDD, + HoodieTableMetadata metadata) { + return 
recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(metadata), true); + } + + class UpdateLocationTask implements Function2, Iterator> { + @Override + public Iterator call(Integer partition, Iterator statusIterator) { + + List writeStatusList = new ArrayList<>(); + // Grab the global HBase connection + synchronized (HBaseIndex.class) { + if (hbaseConnection == null) { + hbaseConnection = getHBaseConnection(); + } + } + HTable hTable = null; + try { + hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName)); + while (statusIterator.hasNext()) { + WriteStatus writeStatus = statusIterator.next(); + List puts = new ArrayList<>(); + try { + for (HoodieRecord rec : writeStatus.getWrittenRecords()) { + if (!writeStatus.isErrored(rec.getKey())) { + Put put = new Put(Bytes.toBytes(rec.getRecordKey())); + HoodieRecordLocation loc = rec.getNewLocation(); + put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, + Bytes.toBytes(loc.getCommitTime())); + put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, + Bytes.toBytes(loc.getFileId())); + put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, + Bytes.toBytes(rec.getPartitionPath())); + puts.add(put); + } + } + hTable.put(puts); + hTable.flushCommits(); + } catch (Exception e) { + Exception we = new Exception("Error updating index for " + writeStatus, e); + logger.error(we); + writeStatus.setGlobalError(we); + } + writeStatusList.add(writeStatus); + } + } catch (IOException e) { + throw new HoodieIndexException( + "Failed to Update Index locations because of exception with HBase Client", e); + } finally { + if (hTable != null) { + try { + hTable.close(); + } catch (IOException e) { + // Ignore + } + } + } + return writeStatusList.iterator(); + } + } + + @Override + public JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTableMetadata metadata) { + return writeStatusRDD.mapPartitionsWithIndex(new UpdateLocationTask(), true); + } + + @Override + public boolean rollbackCommit(String commitTime) 
{ + // TODO (weiy) + return true; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java new file mode 100644 index 000000000..71fef152a --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndex.java @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.index; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Optional; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFlatMapFunction; +import org.apache.spark.api.java.function.PairFunction; + +import scala.Tuple2; +import java.util.*; + +/** + * Indexing mechanism based on bloom filter. Each parquet file includes its row_key bloom filter in + * its metadata. + */ +public class HoodieBloomIndex extends HoodieIndex { + + private static Logger logger = LogManager.getLogger(HoodieBloomIndex.class); + + // we need to limit the join such that it stays within 1.5GB per Spark partition. (SPARK-1476) + private static final int SPARK_MAXIMUM_BYTES_PER_PARTITION = 1500 * 1024 * 1024; + // this is how much a triplet of (partitionPath, fileId, recordKey) costs. 
+ private static final int BYTES_PER_PARTITION_FILE_KEY_TRIPLET = 300; + private static int MAX_ITEMS_PER_JOIN_PARTITION = SPARK_MAXIMUM_BYTES_PER_PARTITION / BYTES_PER_PARTITION_FILE_KEY_TRIPLET; + + public HoodieBloomIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + super(config, jsc); + } + + @Override + /** + * + */ + public JavaRDD> tagLocation(JavaRDD> recordRDD, final HoodieTableMetadata metadata) { + + // Step 1: Extract out thinner JavaPairRDD of (partitionPath, recordKey) + JavaPairRDD partitionRecordKeyPairRDD = recordRDD + .mapToPair(new PairFunction, String, String>() { + @Override + public Tuple2 call(HoodieRecord record) throws Exception { + return new Tuple2<>(record.getPartitionPath(), record.getRecordKey()); + } + }); + + // Lookup indexes for all the partition/recordkey pair + JavaPairRDD rowKeyFilenamePairRDD = + lookupIndex(partitionRecordKeyPairRDD, metadata); + + // Cache the result, for subsequent stages. + rowKeyFilenamePairRDD.cache(); + long totalTaggedRecords = rowKeyFilenamePairRDD.count(); + logger.info("Number of update records (ones tagged with a fileID): " + totalTaggedRecords); + + + // Step 4: Tag the incoming records, as inserts or updates, by joining with existing record keys + // Cost: 4 sec. 
+ return tagLocationBacktoRecords(rowKeyFilenamePairRDD, recordRDD); + } + + public JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, final HoodieTableMetadata metadata) { + JavaPairRDD partitionRecordKeyPairRDD = + hoodieKeys.mapToPair(new PairFunction() { + @Override + public Tuple2 call(HoodieKey key) throws Exception { + return new Tuple2<>(key.getPartitionPath(), key.getRecordKey()); + } + }); + + // Lookup indexes for all the partition/recordkey pair + JavaPairRDD rowKeyFilenamePairRDD = + lookupIndex(partitionRecordKeyPairRDD, metadata); + + JavaPairRDD rowKeyHoodieKeyPairRDD = + hoodieKeys.mapToPair(new PairFunction() { + @Override + public Tuple2 call(HoodieKey key) throws Exception { + return new Tuple2<>(key.getRecordKey(), key); + } + }); + + return rowKeyHoodieKeyPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).mapToPair( + new PairFunction>>, HoodieKey, Optional>() { + @Override + public Tuple2> call( + Tuple2>> keyPathTuple) + throws Exception { + Optional recordLocationPath; + if (keyPathTuple._2._2.isPresent()) { + String fileName = keyPathTuple._2._2.get(); + String partitionPath = keyPathTuple._2._1.getPartitionPath(); + recordLocationPath = Optional + .of(new Path(new Path(metadata.getBasePath(), partitionPath), fileName) + .toUri().getPath()); + } else { + recordLocationPath = Optional.absent(); + } + return new Tuple2<>(keyPathTuple._2._1, recordLocationPath); + } + }); + } + + /** + * Lookup the location for each record key and return the pair for all + * record keys already present and drop the record keys if not present + * + * @param partitionRecordKeyPairRDD + * @param metadata + * @return + */ + private JavaPairRDD lookupIndex( + JavaPairRDD partitionRecordKeyPairRDD, final HoodieTableMetadata metadata) { + // Obtain records per partition, in the incoming records + Map recordsPerPartition = partitionRecordKeyPairRDD.countByKey(); + List affectedPartitionPathList = new ArrayList<>(recordsPerPartition.keySet()); + + // Step 2: Load all 
involved files as pairs + JavaPairRDD partitionFilePairRDD = + loadInvolvedFiles(affectedPartitionPathList, metadata); + Map filesPerPartition = partitionFilePairRDD.countByKey(); + + // Compute total subpartitions, to split partitions into. + Map subpartitionCountMap = + computeSubPartitions(recordsPerPartition, filesPerPartition); + + // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it. + return findMatchingFilesForRecordKeys(partitionFilePairRDD, partitionRecordKeyPairRDD, + subpartitionCountMap); + } + + /** + * The index lookup can be skewed in three dimensions : #files, #partitions, #records + * + * To be able to smoothly handle skews, we need to compute how to split each partitions + * into subpartitions. We do it here, in a way that keeps the amount of each Spark join + * partition to < 2GB. + * + * @param recordsPerPartition + * @param filesPerPartition + * @return + */ + private Map computeSubPartitions(Map recordsPerPartition, Map filesPerPartition) { + Map subpartitionCountMap = new HashMap<>(); + long totalRecords = 0; + long totalFiles = 0; + + for (String partitionPath : recordsPerPartition.keySet()) { + long numRecords = (Long) recordsPerPartition.get(partitionPath); + long numFiles = filesPerPartition.containsKey(partitionPath) ? (Long) filesPerPartition.get(partitionPath) : 1L; + subpartitionCountMap.put(partitionPath, ((numFiles * numRecords) / MAX_ITEMS_PER_JOIN_PARTITION) + 1); + + totalFiles += filesPerPartition.containsKey(partitionPath) ? (Long) filesPerPartition.get(partitionPath) : 0L; + totalRecords += numRecords; + } + logger.info("TotalRecords: " + totalRecords + ", TotalFiles: " + totalFiles + ", TotalAffectedPartitions:" + recordsPerPartition.size()); + logger.info("Sub Partition Counts : " + subpartitionCountMap); + return subpartitionCountMap; + } + + /** + * Load the input records as in memory. 
+ */ + @VisibleForTesting + Map> getPartitionToRowKeys(JavaRDD> recordRDD) { + // Have to wrap the map into a hashmap becuase of the need to braoadcast (see: http://php.sabscape.com/blog/?p=671) + return recordRDD.mapToPair(new PairFunction, String, String>() { + @Override + public Tuple2 call(HoodieRecord record) { + return new Tuple2<>(record.getPartitionPath(), record.getRecordKey()); + } + }).groupByKey().collectAsMap(); + } + + /** + * Load all involved files as pair RDD. + */ + @VisibleForTesting + JavaPairRDD loadInvolvedFiles(List partitions, final HoodieTableMetadata metadata) { + return jsc.parallelize(partitions, Math.max(partitions.size(), 1)) + .flatMapToPair(new PairFlatMapFunction() { + @Override + public Iterable> call(String partitionPath) { + FileSystem fs = FSUtils.getFs(); + String latestCommitTime = metadata.getAllCommits().lastCommit(); + FileStatus[] filteredStatus = metadata.getLatestVersionInPartition(fs, partitionPath, latestCommitTime); + List> list = new ArrayList<>(); + for (FileStatus fileStatus : filteredStatus) { + list.add(new Tuple2<>(partitionPath, fileStatus.getPath().getName())); + } + return list; + } + }); + } + + @Override + public boolean rollbackCommit(String commitTime) { + // Nope, don't need to do anything. + return true; + } + + + /** + * When we subpartition records going into a partition, we still need to check them against + * all the files within the partition. Thus, we need to explode the (partition, file) pairs + * to (partition_subpartnum, file), so we can later join. 
+ * + * + * @param partitionFilePairRDD + * @param subpartitionCountMap + * @return + */ + private JavaPairRDD explodePartitionFilePairRDD(JavaPairRDD partitionFilePairRDD, + final Map subpartitionCountMap) { + return partitionFilePairRDD + .map(new Function, List>>() { + @Override + public List> call(Tuple2 partitionFilePair) throws Exception { + List> explodedPartitionFilePairs = new ArrayList<>(); + for (long l = 0; l < subpartitionCountMap.get(partitionFilePair._1); l++) { + explodedPartitionFilePairs.add(new Tuple2<>( + String.format("%s#%d", partitionFilePair._1, l), + partitionFilePair._2)); + } + return explodedPartitionFilePairs; + } + }) + .flatMapToPair(new PairFlatMapFunction>, String, String>() { + @Override + public Iterable> call(List> exploded) throws Exception { + return exploded; + } + }); + + } + + /** + * To handle tons of incoming records to a partition, we need to split them into groups or create subpartitions. + * Here, we do a simple hash mod splitting, based on computed sub partitions. + * + * @param partitionRecordKeyPairRDD + * @param subpartitionCountMap + * @return + */ + private JavaPairRDD splitPartitionRecordKeysPairRDD(JavaPairRDD partitionRecordKeyPairRDD, + final Map subpartitionCountMap) { + return partitionRecordKeyPairRDD + .mapToPair(new PairFunction, String, String>() { + @Override + public Tuple2 call(Tuple2 partitionRecordKeyPair) throws Exception { + long subpart = Math.abs(partitionRecordKeyPair._2.hashCode()) % subpartitionCountMap.get(partitionRecordKeyPair._1); + return new Tuple2<>( + String.format("%s#%d", partitionRecordKeyPair._1, subpart), + partitionRecordKeyPair._2); + } + }); + } + + + /** + * Its crucial to pick the right parallelism. + * + * totalSubPartitions : this is deemed safe limit, to be nice with Spark. + * inputParallelism : typically number of input files. + * + * We pick the max such that, we are always safe, but go higher if say a there are + * a lot of input files. 
(otherwise, we will fallback to number of partitions in input and + * end up with slow performance) + * + * + * @param inputParallelism + * @param subpartitionCountMap + * @return + */ + private int determineParallelism(int inputParallelism, final Map subpartitionCountMap) { + // size the join parallelism to max(total number of sub partitions, total number of files). + int totalSubparts = 0; + for (long subparts : subpartitionCountMap.values()) { + totalSubparts += (int) subparts; + } + int joinParallelism = Math.max(totalSubparts, inputParallelism); + logger.info("InputParallelism: ${" + inputParallelism + "}, " + + "TotalSubParts: ${" + totalSubparts + "}, " + + "Join Parallelism set to : " + joinParallelism); + return joinParallelism; + } + + + /** + * Find out pair. All workload grouped by file-level. + * + * // Join PairRDD(PartitionPath, RecordKey) and PairRDD(PartitionPath, File) & then repartition such that + // each RDD partition is a file, then for each file, we do (1) load bloom filter, (2) load rowKeys, (3) Tag rowKey + // Make sure the parallelism is atleast the groupby parallelism for tagging location + */ + private JavaPairRDD findMatchingFilesForRecordKeys(JavaPairRDD partitionFilePairRDD, + JavaPairRDD partitionRecordKeyPairRDD, + final Map subpartitionCountMap) { + + // prepare the two RDDs and their join parallelism + JavaPairRDD subpartitionFilePairRDD = explodePartitionFilePairRDD(partitionFilePairRDD, subpartitionCountMap); + JavaPairRDD subpartitionRecordKeyPairRDD = splitPartitionRecordKeysPairRDD(partitionRecordKeyPairRDD, + subpartitionCountMap); + int joinParallelism = determineParallelism(partitionRecordKeyPairRDD.partitions().size(), subpartitionCountMap); + + // Perform a join, to bring all the files in each subpartition ,together with the record keys to be tested against them + JavaPairRDD> joinedTripletRDD = subpartitionFilePairRDD.join(subpartitionRecordKeyPairRDD, joinParallelism); + + // sort further based on filename, such that 
all checking for the file can happen within a single partition, on-the-fly + JavaPairRDD> fileSortedTripletRDD = joinedTripletRDD + .mapToPair(new PairFunction>, String, Tuple2>() { + @Override + /** + * Incoming triplet is (partitionPath_subpart) => (file, recordKey) + */ + public Tuple2> call(Tuple2> joinedTriplet) throws Exception { + String partitionPath = joinedTriplet._1.split("#")[0]; // throw away the subpart + String fileName = joinedTriplet._2._1; + String recordKey = joinedTriplet._2._2; + + // make a sort key as #, to handle skews + return new Tuple2<>(String.format("%s#%s", fileName, recordKey), + new Tuple2<>(fileName, new HoodieKey(recordKey, partitionPath))); + } + }).sortByKey(true, joinParallelism); + + return fileSortedTripletRDD + .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true) + .flatMap(new FlatMapFunction, IndexLookupResult>() { + @Override + public Iterable call(List indexLookupResults) + throws Exception { + return indexLookupResults; + } + }).filter(new Function() { + @Override + public Boolean call(IndexLookupResult lookupResult) throws Exception { + return lookupResult.getMatchingRecordKeys().size() > 0; + } + }).flatMapToPair(new PairFlatMapFunction() { + @Override + public Iterable> call(IndexLookupResult lookupResult) + throws Exception { + List> vals = new ArrayList<>(); + for (String recordKey : lookupResult.getMatchingRecordKeys()) { + vals.add(new Tuple2<>(recordKey, lookupResult.getFileName())); + } + return vals; + } + }); + } + + /** + * Tag the back to the original HoodieRecord RDD. 
+ */ + private JavaRDD> tagLocationBacktoRecords(JavaPairRDD rowKeyFilenamePairRDD, + JavaRDD> recordRDD) { + JavaPairRDD> rowKeyRecordPairRDD = recordRDD.mapToPair( + new PairFunction, String, HoodieRecord>() { + @Override + public Tuple2> call(HoodieRecord record) throws Exception { + return new Tuple2<>(record.getRecordKey(), record); + } + }); + + // Here as the recordRDD might have more data than rowKeyRDD (some rowKeys' fileId is null), so we do left outer join. + return rowKeyRecordPairRDD.leftOuterJoin(rowKeyFilenamePairRDD).values().map( + new Function, Optional>, HoodieRecord>() { + @Override + public HoodieRecord call(Tuple2, Optional> v1) throws Exception { + HoodieRecord record = v1._1(); + if (v1._2().isPresent()) { + String filename = v1._2().get(); + if (filename != null && !filename.isEmpty()) { + record.setCurrentLocation(new HoodieRecordLocation(FSUtils.getCommitTime(filename), + FSUtils.getFileId(filename))); + } + } + return record; + } + }); + } + + @Override + public JavaRDD updateLocation(JavaRDD writeStatusRDD, HoodieTableMetadata metadata) { + return writeStatusRDD; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndexCheckFunction.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndexCheckFunction.java new file mode 100644 index 000000000..28334f243 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieBloomIndexCheckFunction.java @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.index; + +import com.uber.hoodie.common.BloomFilter; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.util.ParquetUtils; +import com.uber.hoodie.exception.HoodieException; +import com.uber.hoodie.exception.HoodieIndexException; +import com.uber.hoodie.func.LazyIterableIterator; + +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.function.Function2; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import scala.Tuple2; + +/** + * Function performing actual checking of RDD parition containing (fileId, hoodieKeys) against the + * actual files + */ +public class HoodieBloomIndexCheckFunction implements Function2>>, Iterator>> { + + private static Logger logger = LogManager.getLogger(HoodieBloomIndexCheckFunction.class); + + private final String basePath; + + public HoodieBloomIndexCheckFunction(String basePath) { + this.basePath = basePath; + } + + /** + * Given a list of row keys and one file, return only row keys existing in that file. 
+ */ + public static List checkCandidatesAgainstFile(List candidateRecordKeys, Path filePath) throws HoodieIndexException { + List foundRecordKeys = new ArrayList<>(); + try { + // Load all rowKeys from the file, to double-confirm + if (!candidateRecordKeys.isEmpty()) { + Set fileRowKeys = ParquetUtils.readRowKeysFromParquet(filePath); + logger.info("Loading " + fileRowKeys.size() + " row keys from " + filePath); + if (logger.isDebugEnabled()) { + logger.debug("Keys from " + filePath + " => " + fileRowKeys); + } + for (String rowKey : candidateRecordKeys) { + if (fileRowKeys.contains(rowKey)) { + foundRecordKeys.add(rowKey); + } + } + logger.info("After checking with row keys, we have " + foundRecordKeys.size() + " results, for file " + filePath + " => " + foundRecordKeys); + if (logger.isDebugEnabled()) { + logger.debug("Keys matching for file " + filePath + " => " + foundRecordKeys); + } + } + } catch (Exception e){ + throw new HoodieIndexException("Error checking candidate keys against file.", e); + } + return foundRecordKeys; + } + + class LazyKeyCheckIterator extends LazyIterableIterator>, List> { + + private List candidateRecordKeys; + + private BloomFilter bloomFilter; + + private String currentFile; + + private String currentParitionPath; + + LazyKeyCheckIterator(Iterator>> fileParitionRecordKeyTripletItr) { + super(fileParitionRecordKeyTripletItr); + currentFile = null; + candidateRecordKeys = new ArrayList<>(); + bloomFilter = null; + currentParitionPath = null; + } + + @Override + protected void start() { + } + + private void initState(String fileName, String partitionPath) throws HoodieIndexException { + try { + Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName); + bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(filePath); + candidateRecordKeys = new ArrayList<>(); + currentFile = fileName; + currentParitionPath = partitionPath; + } catch (Exception e) { + throw new HoodieIndexException("Error checking candidate keys 
against file.", e); + } + } + + @Override + protected List computeNext() { + + List ret = new ArrayList<>(); + try { + // process one file in each go. + while (inputItr.hasNext()) { + + Tuple2> currentTuple = inputItr.next(); + String fileName = currentTuple._2._1; + String partitionPath = currentTuple._2._2.getPartitionPath(); + String recordKey = currentTuple._2._2.getRecordKey(); + + // lazily init state + if (currentFile == null) { + initState(fileName, partitionPath); + } + + // if continue on current file) + if (fileName.equals(currentFile)) { + // check record key against bloom filter of current file & add to possible keys if needed + if (bloomFilter.mightContain(recordKey)) { + if (logger.isDebugEnabled()) { + logger.debug("#1 Adding " + recordKey + " as candidate for file " + fileName); + } + candidateRecordKeys.add(recordKey); + } + } else { + // do the actual checking of file & break out + Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); + logger.info("#1 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath); + if (logger.isDebugEnabled()) { + logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); + } + ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath))); + + initState(fileName, partitionPath); + if (bloomFilter.mightContain(recordKey)) { + if (logger.isDebugEnabled()) { + logger.debug("#2 Adding " + recordKey + " as candidate for file " + fileName); + } + candidateRecordKeys.add(recordKey); + } + break; + } + } + + // handle case, where we ran out of input, finish pending work, update return val + if (!inputItr.hasNext()) { + Path filePath = new Path(basePath + "/" + currentParitionPath + "/" + currentFile); + logger.info("#2 After bloom filter, the candidate row keys is reduced to " + candidateRecordKeys.size() + " for " + filePath); + if (logger.isDebugEnabled()) { + 
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); + } + ret.add(new IndexLookupResult(currentFile, checkCandidatesAgainstFile(candidateRecordKeys, filePath))); + } + + } catch (Throwable e) { + if (e instanceof HoodieException) { + throw e; + } + throw new HoodieIndexException("Error checking bloom filter index. ", e); + } + + return ret; + } + + @Override + protected void end() { + } + } + + + @Override + public Iterator> call(Integer partition, + Iterator>> fileParitionRecordKeyTripletItr) throws Exception { + return new LazyKeyCheckIterator(fileParitionRecordKeyTripletItr); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java new file mode 100644 index 000000000..1df62ca4e --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/HoodieIndex.java @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.index; + +import com.google.common.base.Optional; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieRecord; + +import com.uber.hoodie.exception.HoodieIndexException; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.io.Serializable; + +/** + * Base class for different types of indexes to determine the mapping from uuid + *

+ * TODO(vc): need methods for recovery and rollback + */ +public abstract class HoodieIndex implements Serializable { + protected transient JavaSparkContext jsc = null; + + public enum IndexType { + HBASE, + INMEMORY, + BLOOM + } + + protected final HoodieWriteConfig config; + + protected HoodieIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + this.config = config; + this.jsc = jsc; + } + + /** + * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional] + * If the optional FullFilePath value is not present, then the key is not found. If the FullFilePath + * value is present, it is the path component (without scheme) of the URI underlying file + * + * @param hoodieKeys + * @param metadata + * @return + */ + public abstract JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, final HoodieTableMetadata metadata); + + /** + * Looks up the index and tags each incoming record with a location of a file that contains the + * row (if it is actually present) + */ + public abstract JavaRDD> tagLocation(JavaRDD> recordRDD, + HoodieTableMetadata metadata) throws + HoodieIndexException; + + /** + * Extracts the location of written records, and updates the index. + *

+ * TODO(vc): We may need to propagate the record as well in a WriteStatus class + */ + public abstract JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTableMetadata metadata) throws + HoodieIndexException; + + /** + * Rollback the efffects of the commit made at commitTime. + */ + public abstract boolean rollbackCommit(String commitTime); + + public static HoodieIndex createIndex( + HoodieWriteConfig config, JavaSparkContext jsc) throws HoodieIndexException { + switch (config.getIndexType()) { + case HBASE: + return new HBaseIndex<>(config, jsc); + case INMEMORY: + return new InMemoryHashIndex<>(config, jsc); + case BLOOM: + return new HoodieBloomIndex<>(config, jsc); + } + throw new HoodieIndexException("Index type unspecified, set " + config.getIndexType()); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java new file mode 100644 index 000000000..775aaf9fb --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/InMemoryHashIndex.java @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.index; + +import com.google.common.base.Optional; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.Function2; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + + +/** + * Hoodie Index implementation backed by an in-memory Hash map. + * + * ONLY USE FOR LOCAL TESTING + * + */ +public class InMemoryHashIndex extends HoodieIndex { + + private static ConcurrentMap recordLocationMap; + + public InMemoryHashIndex(HoodieWriteConfig config, JavaSparkContext jsc) { + super(config, jsc); + recordLocationMap = new ConcurrentHashMap<>(); + } + + @Override + public JavaPairRDD> fetchRecordLocation( + JavaRDD hoodieKeys, final HoodieTableMetadata metadata) { + throw new UnsupportedOperationException("InMemory index does not implement check exist yet"); + } + + /** + * Function that tags each HoodieRecord with an existing location, if known. 
+ */ + class LocationTagFunction + implements Function2>, Iterator>> { + @Override + public Iterator> call(Integer partitionNum, + Iterator> hoodieRecordIterator) { + List> taggedRecords = new ArrayList<>(); + while (hoodieRecordIterator.hasNext()) { + HoodieRecord rec = hoodieRecordIterator.next(); + if (recordLocationMap.containsKey(rec.getKey())) { + rec.setCurrentLocation(recordLocationMap.get(rec.getKey())); + } + taggedRecords.add(rec); + } + return taggedRecords.iterator(); + } + } + + @Override + public JavaRDD> tagLocation(JavaRDD> recordRDD, + HoodieTableMetadata metadata) { + return recordRDD.mapPartitionsWithIndex(this.new LocationTagFunction(), true); + } + + @Override + public JavaRDD updateLocation(JavaRDD writeStatusRDD, + HoodieTableMetadata metadata) { + return writeStatusRDD.map(new Function() { + @Override + public WriteStatus call(WriteStatus writeStatus) { + for (HoodieRecord record : writeStatus.getWrittenRecords()) { + if (!writeStatus.isErrored(record.getKey())) { + recordLocationMap.put(record.getKey(), record.getNewLocation()); + } + } + return writeStatus; + } + }); + } + + @Override + public boolean rollbackCommit(String commitTime) { + // TODO (weiy) + return true; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/IndexLookupResult.java b/hoodie-client/src/main/java/com/uber/hoodie/index/IndexLookupResult.java new file mode 100644 index 000000000..7f9666d78 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/IndexLookupResult.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.index; + +import java.util.List; + +/** + * Encapsulates the result from an index lookup + */ +public class IndexLookupResult { + + private String fileName; + + + private List matchingRecordKeys; + + public IndexLookupResult(String fileName, List matchingRecordKeys) { + this.fileName = fileName; + this.matchingRecordKeys = matchingRecordKeys; + } + + public String getFileName() { + return fileName; + } + + public List getMatchingRecordKeys() { + return matchingRecordKeys; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java new file mode 100644 index 000000000..e2dd4d77f --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleaner.java @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.io; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.common.model.HoodieCommits; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * Cleaner is responsible for garbage collecting older files in a given partition path, such that + * + * 1) It provides sufficient time for existing queries running on older versions, to finish + * + * 2) It bounds the growth of the files in the file system + * + * TODO: Should all cleaning be done based on {@link com.uber.hoodie.common.model.HoodieCommitMetadata} + * + * + */ +public class HoodieCleaner { + + public enum CleaningPolicy { + KEEP_LATEST_FILE_VERSIONS, + KEEP_LATEST_COMMITS + } + + + private static Logger logger = LogManager.getLogger(HoodieCleaner.class); + + + private HoodieTableMetadata metadata; + + private HoodieWriteConfig config; + + private FileSystem fs; + + public HoodieCleaner(HoodieTableMetadata metadata, + HoodieWriteConfig config, + FileSystem fs) { + this.metadata = metadata; + this.config = config; + this.fs = fs; + } + + + /** + * + * Selects the older versions of files for cleaning, such that it bounds the number of versions of each file. 
+ * This policy is useful, if you are simply interested in querying the table, and you don't want too many + * versions for a single file (i.e run it with versionsRetained = 1) + * + * + * @param partitionPath + * @return + * @throws IOException + */ + private List getFilesToCleanKeepingLatestVersions(String partitionPath) throws IOException { + logger.info("Cleaning "+ partitionPath+", retaining latest "+ config.getCleanerFileVersionsRetained()+" file versions. "); + Map> fileVersions = metadata.getAllVersionsInPartition(fs, partitionPath); + List deletePaths = new ArrayList<>(); + + for (String file : fileVersions.keySet()) { + List commitList = fileVersions.get(file); + int keepVersions = config.getCleanerFileVersionsRetained(); + Iterator commitItr = commitList.iterator(); + while (commitItr.hasNext() && keepVersions > 0) { + // Skip this most recent version + commitItr.next(); + keepVersions--; + } + // Delete the remaining files + while (commitItr.hasNext()) { + deletePaths.add(String.format("%s/%s/%s", + config.getBasePath(), + partitionPath, + commitItr.next().getPath().getName())); + } + } + return deletePaths; + } + + + /** + * Selects the versions for file for cleaning, such that it + * + * - Leaves the latest version of the file untouched + * - For older versions, + * - It leaves all the commits untouched which has occured in last config.getCleanerCommitsRetained() commits + * - It leaves ONE commit before this window. We assume that the max(query execution time) == commit_batch_time * config.getCleanerCommitsRetained(). This is 12 hours by default. + * This is essential to leave the file used by the query thats running for the max time. + * + * This provides the effect of having lookback into all changes that happened in the last X + * commits. (eg: if you retain 24 commits, and commit batch time is 30 mins, then you have 12 hrs of lookback) + * + * This policy is the default. 
+ * + * @param partitionPath + * @return + * @throws IOException + */ + private List getFilesToCleanKeepingLatestCommits(String partitionPath) + throws IOException { + int commitsRetained = config.getCleanerCommitsRetained(); + logger.info( + "Cleaning " + partitionPath + ", retaining latest " + commitsRetained + " commits. "); + List deletePaths = new ArrayList<>(); + + // determine if we have enough commits, to start cleaning. + HoodieCommits commits = metadata.getAllCommits(); + if (commits.getNumCommits() > commitsRetained) { + String earliestCommitToRetain = + commits.nthCommit(commits.getNumCommits() - commitsRetained); + Map> fileVersions = + metadata.getAllVersionsInPartition(fs, partitionPath); + for (String file : fileVersions.keySet()) { + List fileList = fileVersions.get(file); + String lastVersion = FSUtils.getCommitTime(fileList.get(0).getPath().getName()); + String lastVersionBeforeEarliestCommitToRetain = + getLatestVersionBeforeCommit(fileList, earliestCommitToRetain); + + // Ensure there are more than 1 version of the file (we only clean old files from updates) + // i.e always spare the last commit. + for (FileStatus afile : fileList) { + String fileCommitTime = FSUtils.getCommitTime(afile.getPath().getName()); + // Dont delete the latest commit and also the last commit before the earliest commit we are retaining + // The window of commit retain == max query run time. So a query could be running which still + // uses this file. + if (fileCommitTime.equals(lastVersion) || ( + lastVersionBeforeEarliestCommitToRetain != null && fileCommitTime + .equals(lastVersionBeforeEarliestCommitToRetain))) { + // move on to the next file + continue; + } + + // Always keep the last commit + if (HoodieCommits.isCommit1After(earliestCommitToRetain, fileCommitTime)) { + // this is a commit, that should be cleaned. 
+ deletePaths.add(String + .format("%s/%s/%s", config.getBasePath(), partitionPath, + FSUtils.maskWithoutTaskPartitionId(fileCommitTime, file))); + } + } + } + } + + return deletePaths; + } + + /** + * Gets the latest version < commitTime. This version file could still be used by queries. + */ + private String getLatestVersionBeforeCommit(List fileList, String commitTime) { + for (FileStatus file : fileList) { + String fileCommitTime = FSUtils.getCommitTime(file.getPath().getName()); + if (HoodieCommits.isCommit1After(commitTime, fileCommitTime)) { + // fileList is sorted on the reverse, so the first commit we find <= commitTime is the one we want + return fileCommitTime; + } + } + // There is no version of this file which is <= commitTime + return null; + } + + + /** + * Performs cleaning of the partition path according to cleaning policy and returns the number + * of files cleaned. + * + * @throws IllegalArgumentException if unknown cleaning policy is provided + */ + public int clean(String partitionPath) throws IOException { + CleaningPolicy policy = config.getCleanerPolicy(); + List deletePaths; + if (policy == CleaningPolicy.KEEP_LATEST_COMMITS) { + deletePaths = getFilesToCleanKeepingLatestCommits(partitionPath); + } else if (policy == CleaningPolicy.KEEP_LATEST_FILE_VERSIONS) { + deletePaths = getFilesToCleanKeepingLatestVersions(partitionPath); + } else { + throw new IllegalArgumentException("Unknown cleaning policy : " + policy.name()); + } + + // perform the actual deletes + for (String deletePath : deletePaths) { + logger.info("Working on delete path :" + deletePath); + FileStatus[] deleteVersions = fs.globStatus(new Path(deletePath)); + if (deleteVersions != null) { + for (FileStatus deleteVersion : deleteVersions) { + if (fs.delete(deleteVersion.getPath(), false)) { + logger.info("Cleaning file at path :" + deleteVersion.getPath()); + } + } + } + } + logger.info(deletePaths.size() + " files deleted for partition path:" + partitionPath); + return 
deletePaths.size(); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java new file mode 100644 index 000000000..679314e7a --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.common.file.HoodieAppendLog; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieCommitException; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.BZip2Codec; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Log to hold older historical commits, to bound the growth of .commit files + */ +public class HoodieCommitArchiveLog { + private static Logger log = LogManager.getLogger(HoodieCommitArchiveLog.class); + private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits.archived"; + + private final Path 
archiveFilePath; + private final FileSystem fs; + private final HoodieWriteConfig config; + + public HoodieCommitArchiveLog(HoodieWriteConfig config) { + this.archiveFilePath = + new Path(config.getBasePath(), + HoodieTableMetadata.METAFOLDER_NAME + "/" +HOODIE_COMMIT_ARCHIVE_LOG_FILE); + this.fs = FSUtils.getFs(); + this.config = config; + } + + /** + * Check if commits need to be archived. If yes, archive commits. + */ + public boolean archiveIfRequired() { + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, config.getBasePath()); + List commitsToArchive = getCommitsToArchive(metadata); + if (!commitsToArchive.isEmpty()) { + log.info("Archiving commits " + commitsToArchive); + archive(metadata, commitsToArchive); + return deleteCommits(metadata, commitsToArchive); + } else { + log.info("No Commits to archive"); + return true; + } + } + + private List getCommitsToArchive(HoodieTableMetadata metadata) { + int maxCommitsToKeep = config.getMaxCommitsToKeep(); + int minCommitsToKeep = config.getMinCommitsToKeep(); + + List commits = metadata.getAllCommits().getCommitList(); + List commitsToArchive = new ArrayList(); + if (commits.size() > maxCommitsToKeep) { + // Actually do the commits + commitsToArchive = commits.subList(0, commits.size() - minCommitsToKeep); + } + return commitsToArchive; + } + + private boolean deleteCommits(HoodieTableMetadata metadata, List commitsToArchive) { + log.info("Deleting commits " + commitsToArchive); + boolean success = true; + for(String commitToArchive:commitsToArchive) { + Path commitFile = + new Path(metadata.getBasePath() + "/" + + HoodieTableMetadata.METAFOLDER_NAME + "/" + + FSUtils.makeCommitFileName(commitToArchive)); + try { + if (fs.exists(commitFile)) { + success &= fs.delete(commitFile, false); + log.info("Archived and deleted commit file " + commitFile); + } + } catch (IOException e) { + throw new HoodieIOException( + "Failed to delete archived commit " + commitToArchive, e); + } + } + return success; + } + + 
private HoodieAppendLog.Writer openWriter() throws IOException { + log.info("Opening archive file at path: " + archiveFilePath); + return HoodieAppendLog + .createWriter(fs.getConf(), HoodieAppendLog.Writer.file(archiveFilePath), + HoodieAppendLog.Writer.keyClass(Text.class), + HoodieAppendLog.Writer.appendIfExists(true), + HoodieAppendLog.Writer.valueClass(Text.class), HoodieAppendLog.Writer + .compression(HoodieAppendLog.CompressionType.RECORD, new BZip2Codec())); + } + + private void archive(HoodieTableMetadata metadata, List commits) + throws HoodieCommitException { + HoodieAppendLog.Writer writer = null; + try { + writer = openWriter(); + for (String commitTime : commits) { + Text k = new Text(commitTime); + Text v = new Text(metadata.getCommitMetadata(commitTime).toJsonString()); + writer.append(k, v); + log.info("Wrote " + k); + } + } catch (IOException e) { + throw new HoodieCommitException("Could not archive commits " + commits, e); + } finally { + if (writer != null) { + try { + writer.hsync(); + writer.close(); + } catch (IOException e) { + throw new HoodieCommitException( + "Could not close the archive commits writer " + commits, e); + } + } + } + } + + public Path getArchiveFilePath() { + return archiveFilePath; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java new file mode 100644 index 000000000..28f3f86ea --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.HoodieAvroUtils; +import com.uber.hoodie.exception.HoodieIOException; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; + +public abstract class HoodieIOHandle { + private static Logger logger = LogManager.getLogger(HoodieIOHandle.class); + protected final String commitTime; + protected final HoodieWriteConfig config; + protected final FileSystem fs; + protected final HoodieTableMetadata metadata; + protected final Schema schema; + + public HoodieIOHandle(HoodieWriteConfig config, String commitTime, + HoodieTableMetadata metadata) { + this.commitTime = commitTime; + this.config = config; + this.fs = FSUtils.getFs(); + this.metadata = metadata; + this.schema = + HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); + } + + public Path makeNewPath(String partitionPath, int taskPartitionId, String fileName) { + Path path = new Path(config.getBasePath(), partitionPath); + try { + fs.mkdirs(path); // create a new partition as needed. 
+ } catch (IOException e) { + throw new HoodieIOException("Failed to make dir " + path, e); + } + + return new Path(path.toString(), + FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName)); + } + + /** + * Deletes any new tmp files written during the current commit, into the partition + */ + public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config, + String commitTime, + String partitionPath, + int taskPartitionId) { + FileSystem fs = FSUtils.getFs(); + try { + FileStatus[] prevFailedFiles = fs.globStatus(new Path(String + .format("%s/%s/%s", config.getBasePath(), partitionPath, + FSUtils.maskWithoutFileId(commitTime, taskPartitionId)))); + if (prevFailedFiles != null) { + logger.info("Deleting " + prevFailedFiles.length + + " files generated by previous failed attempts."); + for (FileStatus status : prevFailedFiles) { + fs.delete(status.getPath(), false); + } + } + } catch (IOException e) { + throw new HoodieIOException("Failed to cleanup Temp files from commit " + commitTime, + e); + } + } + + public Schema getSchema() { + return schema; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java new file mode 100644 index 000000000..9b2ac3b6d --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieInsertHandle.java @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieInsertException; +import com.uber.hoodie.io.storage.HoodieStorageWriter; +import com.uber.hoodie.io.storage.HoodieStorageWriterFactory; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.TaskContext; + +import java.io.IOException; +import java.util.UUID; + +public class HoodieInsertHandle extends HoodieIOHandle { + private static Logger logger = LogManager.getLogger(HoodieInsertHandle.class); + + private final WriteStatus status; + private final HoodieStorageWriter storageWriter; + private final Path path; + private int recordsWritten = 0; + + public HoodieInsertHandle(HoodieWriteConfig config, String commitTime, + HoodieTableMetadata metadata, String partitionPath) { + super(config, commitTime, metadata); + this.status = new WriteStatus(); + status.setFileId(UUID.randomUUID().toString()); + status.setPartitionPath(partitionPath); + + this.path = makeNewPath(partitionPath, TaskContext.getPartitionId(), status.getFileId()); + try { + this.storageWriter = + HoodieStorageWriterFactory.getStorageWriter(commitTime, path, metadata, config, schema); + } catch (IOException e) { + throw new HoodieInsertException( + "Failed to initialize HoodieStorageWriter for path " + path, e); + } + logger.info("New InsertHandle for partition :" + partitionPath); + } + + /** + * 
Determines whether we can accept the incoming records, into the current file, depending on + *

+ * - Whether it belongs to the same partitionPath as existing records + * - Whether the current file written bytes < max file size + * + * @return + */ + public boolean canWrite(HoodieRecord record) { + return storageWriter.canWrite() && record.getPartitionPath() + .equals(status.getPartitionPath()); + } + + /** + * Perform the actual writing of the given record into the backing file. + * + * @param record + */ + public void write(HoodieRecord record) { + try { + IndexedRecord avroRecord = record.getData().getInsertValue(schema); + storageWriter.writeAvroWithMetadata(avroRecord, record); + status.markSuccess(record); + // update the new location of record, so we know where to find it next + record.setNewLocation(new HoodieRecordLocation(commitTime, status.getFileId())); + record.deflate(); + recordsWritten++; + } catch (Throwable t) { + status.markFailure(record, t); + logger.error("Error writing record " + record, t); + } + } + + /** + * Performs actions to durably, persist the current changes and returns a WriteStatus object + * + * @return + */ + public WriteStatus close() { + logger.info( + "Closing the file " + status.getFileId() + " as we are done with all the records " + + recordsWritten); + try { + storageWriter.close(); + + HoodieWriteStat stat = new HoodieWriteStat(); + stat.setNumWrites(recordsWritten); + stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT); + stat.setFileId(status.getFileId()); + stat.setFullPath(path.toString()); + stat.setTotalWriteBytes(FSUtils.getFileSize(fs, path)); + stat.setTotalWriteErrors(status.getFailedRecords().size()); + status.setStat(stat); + + return status; + } catch (IOException e) { + throw new HoodieInsertException("Failed to close the Insert Handle for path " + path, + e); + } + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieUpdateHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieUpdateHandle.java new file mode 100644 index 000000000..43394c326 --- /dev/null +++ 
b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieUpdateHandle.java @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieUpsertException; +import com.uber.hoodie.io.storage.HoodieStorageWriter; +import com.uber.hoodie.io.storage.HoodieStorageWriterFactory; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.TaskContext; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; + +@SuppressWarnings("Duplicates") public class HoodieUpdateHandle extends HoodieIOHandle { + private static Logger logger = LogManager.getLogger(HoodieUpdateHandle.class); + + private final WriteStatus writeStatus; + private final HashMap> keyToNewRecords; + private HoodieStorageWriter storageWriter; + private 
Path newFilePath; + private Path oldFilePath; + private long recordsWritten = 0; + private long updatedRecordsWritten = 0; + private String fileId; + + public HoodieUpdateHandle(HoodieWriteConfig config, + String commitTime, + HoodieTableMetadata metadata, + Iterator> recordItr, + String fileId) { + super(config, commitTime, metadata); + WriteStatus writeStatus = new WriteStatus(); + writeStatus.setStat(new HoodieWriteStat()); + this.writeStatus = writeStatus; + this.fileId = fileId; + this.keyToNewRecords = new HashMap<>(); + init(recordItr); + } + + /** + * Load the new incoming records in a map, and extract the old file path. + */ + private void init(Iterator> newRecordsItr) { + try { + // Load the new records in a map + while (newRecordsItr.hasNext()) { + HoodieRecord record = newRecordsItr.next(); + // If the first record, we need to extract some info out + if (oldFilePath == null) { + String latestValidFilePath = metadata.getFilenameForRecord(fs, record, fileId); + writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath)); + oldFilePath = new Path( + config.getBasePath() + "/" + record.getPartitionPath() + "/" + + latestValidFilePath); + newFilePath = new Path( + config.getBasePath() + "/" + record.getPartitionPath() + "/" + FSUtils + .makeDataFileName(commitTime, TaskContext.getPartitionId(), fileId)); + + // handle cases of partial failures, for update task + if (fs.exists(newFilePath)) { + fs.delete(newFilePath, false); + } + + logger.info(String.format("Merging new data into oldPath %s, as newPath %s", + oldFilePath.toString(), newFilePath.toString())); + // file name is same for all records, in this bunch + writeStatus.setFileId(fileId); + writeStatus.setPartitionPath(record.getPartitionPath()); + writeStatus.getStat().setFileId(fileId); + writeStatus.getStat().setFullPath(newFilePath.toString()); + } + keyToNewRecords.put(record.getRecordKey(), record); + // update the new location of the record, so we know where to find it next + 
record.setNewLocation(new HoodieRecordLocation(commitTime, fileId)); + } + // Create the writer for writing the new version file + storageWriter = HoodieStorageWriterFactory + .getStorageWriter(commitTime, newFilePath, metadata, config, schema); + + } catch (Exception e) { + logger.error("Error in update task at commit " + commitTime, e); + writeStatus.setGlobalError(e); + } + } + + + private void writeUpdateRecord(HoodieRecord hoodieRecord, IndexedRecord indexedRecord) { + try { + storageWriter.writeAvroWithMetadata(indexedRecord, hoodieRecord); + hoodieRecord.deflate(); + writeStatus.markSuccess(hoodieRecord); + recordsWritten ++; + updatedRecordsWritten ++; + } catch (Exception e) { + logger.error("Error writing record "+ hoodieRecord, e); + writeStatus.markFailure(hoodieRecord, e); + } + } + + /** + * Go through an old record. Here if we detect a newer version shows up, we write the new one to the file. + */ + public void write(GenericRecord oldRecord) { + String key = oldRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + HoodieRecord hoodieRecord = keyToNewRecords.get(key); + if (keyToNewRecords.containsKey(key)) { + try { + IndexedRecord avroRecord = hoodieRecord.getData().combineAndGetUpdateValue(oldRecord, schema); + writeUpdateRecord(hoodieRecord, avroRecord); + keyToNewRecords.remove(key); + } catch (Exception e) { + throw new HoodieUpsertException("Failed to combine/merge new record with old value in storage, for new record {" + + keyToNewRecords.get(key) + "}, old value {" + oldRecord + "}", e); + } + } else { + // this should work as it is, since this is an existing record + String errMsg = "Failed to merge old record into new file for key " + key + " from old file " + + getOldFilePath() + " to new file " + newFilePath; + try { + storageWriter.writeAvro(key, oldRecord); + } catch (ClassCastException e) { + logger.error( + "Schema mismatch when rewriting old record " + oldRecord + " from file " + + getOldFilePath() + " to file " + 
newFilePath + " with schema " + schema + .toString(true)); + throw new HoodieUpsertException(errMsg, e); + } catch (IOException e) { + logger.error("Failed to merge old record into new file for key " + key + " from old file " + + getOldFilePath() + " to new file " + newFilePath, e); + throw new HoodieUpsertException(errMsg, e); + } + recordsWritten ++; + } + } + + public void close() { + try { + // write out any pending records (this can happen when inserts are turned into updates) + Iterator pendingRecordsItr = keyToNewRecords.keySet().iterator(); + while (pendingRecordsItr.hasNext()) { + String key = pendingRecordsItr.next(); + HoodieRecord hoodieRecord = keyToNewRecords.get(key); + writeUpdateRecord(hoodieRecord, hoodieRecord.getData().getInsertValue(schema)); + } + keyToNewRecords.clear(); + + if (storageWriter != null) { + storageWriter.close(); + } + writeStatus.getStat().setTotalWriteBytes(FSUtils.getFileSize(fs, newFilePath)); + writeStatus.getStat().setNumWrites(recordsWritten); + writeStatus.getStat().setNumUpdateWrites(updatedRecordsWritten); + writeStatus.getStat().setTotalWriteErrors(writeStatus.getFailedRecords().size()); + } catch (IOException e) { + throw new HoodieUpsertException("Failed to close UpdateHandle", e); + } + } + + public Path getOldFilePath() { + return oldFilePath; + } + + public WriteStatus getWriteStatus() { + return writeStatus; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetConfig.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetConfig.java new file mode 100644 index 000000000..363bbb78f --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetConfig.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io.storage; + +import com.uber.hoodie.avro.HoodieAvroWriteSupport; +import org.apache.avro.Schema; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +public class HoodieParquetConfig { + private HoodieAvroWriteSupport writeSupport; + private CompressionCodecName compressionCodecName; + private int blockSize; + private int pageSize; + private int maxFileSize; + private Configuration hadoopConf; + + public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport, + CompressionCodecName compressionCodecName, int blockSize, int pageSize, int maxFileSize, + Configuration hadoopConf) { + this.writeSupport = writeSupport; + this.compressionCodecName = compressionCodecName; + this.blockSize = blockSize; + this.pageSize = pageSize; + this.maxFileSize = maxFileSize; + this.hadoopConf = hadoopConf; + } + + public HoodieAvroWriteSupport getWriteSupport() { + return writeSupport; + } + + public CompressionCodecName getCompressionCodecName() { + return compressionCodecName; + } + + public int getBlockSize() { + return blockSize; + } + + public int getPageSize() { + return pageSize; + } + + public int getMaxFileSize() { + return maxFileSize; + } + + public Configuration getHadoopConf() { + return hadoopConf; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java new file mode 100644 index 000000000..5d776c5ed --- /dev/null +++ 
b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io.storage; + +import com.uber.hoodie.avro.HoodieAvroWriteSupport; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.util.HoodieAvroUtils; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.spark.TaskContext; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +/** + * HoodieParquetWriter extends the ParquetWriter to help limit the size of underlying file. 
+ * Provides a way to check if the current file can take more records with the canWrite() + * + * @param + */ +public class HoodieParquetWriter + extends ParquetWriter implements HoodieStorageWriter { + private static double STREAM_COMPRESSION_RATIO = 0.1; + private static AtomicLong recordIndex = new AtomicLong(1); + + private final Path file; + private final HoodieWrapperFileSystem fs; + private final long maxFileSize; + private final HoodieAvroWriteSupport writeSupport; + private final String commitTime; + private final Schema schema; + + + private static Configuration registerFileSystem(Configuration conf) { + Configuration returnConf = new Configuration(conf); + String scheme = FileSystem.getDefaultUri(conf).getScheme(); + returnConf.set("fs." + HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl", + HoodieWrapperFileSystem.class.getName()); + return returnConf; + } + + public HoodieParquetWriter(String commitTime, Path file, + HoodieParquetConfig parquetConfig, Schema schema) throws IOException { + super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()), + ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), + parquetConfig.getCompressionCodecName(), parquetConfig.getBlockSize(), + parquetConfig.getPageSize(), parquetConfig.getPageSize(), + ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, + ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, + registerFileSystem(parquetConfig.getHadoopConf())); + this.file = + HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()); + this.fs = (HoodieWrapperFileSystem) this.file + .getFileSystem(registerFileSystem(parquetConfig.getHadoopConf())); + // We cannot accurately measure the snappy compressed output file size. 
We are choosing a conservative 10% + // TODO - compute this compression ratio dynamically by looking at the bytes written to the stream and the actual file size reported by HDFS + this.maxFileSize = parquetConfig.getMaxFileSize() + Math + .round(parquetConfig.getMaxFileSize() * STREAM_COMPRESSION_RATIO); + this.writeSupport = parquetConfig.getWriteSupport(); + this.commitTime = commitTime; + this.schema = schema; + } + + + @Override + public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException { + String seqId = HoodieRecord.generateSequenceId(commitTime, TaskContext.getPartitionId(), + recordIndex.getAndIncrement()); + HoodieAvroUtils.addHoodieKeyToRecord((GenericRecord) avroRecord, + record.getRecordKey(), + record.getPartitionPath(), + file.getName()); + HoodieAvroUtils.addCommitMetadataToRecord((GenericRecord) avroRecord, commitTime, seqId); + super.write(avroRecord); + writeSupport.add(record.getRecordKey()); + } + + public boolean canWrite() { + return fs.getBytesWritten(file) < maxFileSize; + } + + @Override public void writeAvro(String key, IndexedRecord object) throws IOException { + super.write(object); + writeSupport.add(key); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriter.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriter.java new file mode 100644 index 000000000..e4fcdc335 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriter.java @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io.storage; + +import com.uber.hoodie.common.model.HoodieRecord; +import org.apache.avro.generic.IndexedRecord; + +import java.io.IOException; + +public interface HoodieStorageWriter { + void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException; + boolean canWrite(); + void close() throws IOException; + void writeAvro(String key, R oldRecord) throws IOException; +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java new file mode 100644 index 000000000..c393b638f --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieStorageWriterFactory.java @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.io.storage; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.avro.HoodieAvroWriteSupport; +import com.uber.hoodie.common.BloomFilter; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; + +import java.io.IOException; + +public class HoodieStorageWriterFactory { + public static HoodieStorageWriter getStorageWriter( + String commitTime, Path path, HoodieTableMetadata metadata, HoodieWriteConfig config, Schema schema) + throws IOException { + //TODO - based on the metadata choose the implementation of HoodieStorageWriter + // Currently only parquet is supported + return newParquetStorageWriter(commitTime, path, config, schema); + } + + private static HoodieStorageWriter newParquetStorageWriter( + String commitTime, Path path, HoodieWriteConfig config, Schema schema) throws IOException { + BloomFilter filter = + new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP()); + HoodieAvroWriteSupport writeSupport = + new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); + + HoodieParquetConfig parquetConfig = + new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, + config.getParquetBlockSize(), config.getParquetPageSize(), + config.getParquetMaxFileSize(), FSUtils.getFs().getConf()); + + return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java new file mode 100644 index 000000000..64034b4d2 --- /dev/null +++ 
b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io.storage; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.fs.permission.AclEntry; +import org.apache.hadoop.fs.permission.AclStatus; +import org.apache.hadoop.fs.permission.FsAction; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hadoop.security.AccessControlException; +import org.apache.hadoop.security.Credentials; +import org.apache.hadoop.security.token.Token; +import org.apache.hadoop.util.Progressable; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; + +/** + * HoodieWrapperFileSystem wraps the default file system. + * It holds state about the open streams in the file system to support getting the + * written size to each of the open streams. 
+ */ +public class HoodieWrapperFileSystem extends FileSystem { + private static final Set SUPPORT_SCHEMES; + public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; + + static { + SUPPORT_SCHEMES = new HashSet<>(2); + SUPPORT_SCHEMES.add("file"); + SUPPORT_SCHEMES.add("hdfs"); + } + + private ConcurrentMap openStreams = + new ConcurrentHashMap<>(); + private FileSystem fileSystem; + private URI uri; + + @Override public void initialize(URI uri, Configuration conf) throws IOException { + // Get the default filesystem to decorate + fileSystem = FileSystem.get(conf); + // Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get + // fileSystem.initialize(FileSystem.getDefaultUri(conf), conf); + // fileSystem.setConf(conf); + this.uri = uri; + } + + @Override public URI getUri() { + return uri; + } + + @Override public FSDataInputStream open(Path f, int bufferSize) throws IOException { + return fileSystem.open(convertToDefaultPath(f), bufferSize); + } + + @Override public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, + int bufferSize, short replication, long blockSize, Progressable progress) + throws IOException { + final Path translatedPath = convertToDefaultPath(f); + return wrapOutputStream(f, fileSystem + .create(translatedPath, permission, overwrite, bufferSize, replication, blockSize, + progress)); + } + + private FSDataOutputStream wrapOutputStream(final Path path, + FSDataOutputStream fsDataOutputStream) throws IOException { + if (fsDataOutputStream instanceof SizeAwareFSDataOutputStream) { + return fsDataOutputStream; + } + + SizeAwareFSDataOutputStream os = + new SizeAwareFSDataOutputStream(fsDataOutputStream, new Runnable() { + @Override public void run() { + openStreams.remove(path.getName()); + } + }); + openStreams.put(path.getName(), os); + return os; + } + + @Override public FSDataOutputStream create(Path f, boolean overwrite) throws IOException { + return 
wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f), overwrite)); + } + + @Override public FSDataOutputStream create(Path f) throws IOException { + return wrapOutputStream(f, fileSystem.create(convertToDefaultPath(f))); + } + + @Override public FSDataOutputStream create(Path f, Progressable progress) throws IOException { + return fileSystem.create(convertToDefaultPath(f), progress); + } + + @Override public FSDataOutputStream create(Path f, short replication) throws IOException { + return fileSystem.create(convertToDefaultPath(f), replication); + } + + @Override public FSDataOutputStream create(Path f, short replication, Progressable progress) + throws IOException { + return fileSystem.create(convertToDefaultPath(f), replication, progress); + } + + @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize) + throws IOException { + return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize); + } + + @Override public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, + Progressable progress) throws IOException { + return fileSystem.create(convertToDefaultPath(f), overwrite, bufferSize, progress); + } + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, + long blockSize, Progressable progress) throws IOException { + return fileSystem + .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize, + progress); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, + int bufferSize, short replication, long blockSize, Progressable progress) + throws IOException { + return fileSystem + .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, + progress); + } + + @Override + public FSDataOutputStream create(Path f, FsPermission permission, EnumSet flags, + int bufferSize, short replication, long blockSize, Progressable progress, + Options.ChecksumOpt 
checksumOpt) throws IOException { + return fileSystem + .create(convertToDefaultPath(f), permission, flags, bufferSize, replication, blockSize, + progress, checksumOpt); + } + + + @Override + public FSDataOutputStream create(Path f, boolean overwrite, int bufferSize, short replication, + long blockSize) throws IOException { + return fileSystem + .create(convertToDefaultPath(f), overwrite, bufferSize, replication, blockSize); + } + + + @Override public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) + throws IOException { + return fileSystem.append(convertToDefaultPath(f), bufferSize, progress); + } + + @Override public boolean rename(Path src, Path dst) throws IOException { + return fileSystem.rename(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override public boolean delete(Path f, boolean recursive) throws IOException { + return fileSystem.delete(convertToDefaultPath(f), recursive); + } + + @Override public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertToDefaultPath(f)); + } + + @Override public void setWorkingDirectory(Path new_dir) { + fileSystem.setWorkingDirectory(convertToDefaultPath(new_dir)); + } + + @Override public Path getWorkingDirectory() { + return convertToHoodiePath(fileSystem.getWorkingDirectory()); + } + + @Override public boolean mkdirs(Path f, FsPermission permission) throws IOException { + return fileSystem.mkdirs(convertToDefaultPath(f), permission); + } + + @Override public FileStatus getFileStatus(Path f) throws IOException { + return fileSystem.getFileStatus(convertToDefaultPath(f)); + } + + @Override public String getScheme() { + return uri.getScheme(); + } + + @Override public String getCanonicalServiceName() { + return fileSystem.getCanonicalServiceName(); + } + + @Override public String getName() { + return fileSystem.getName(); + } + + @Override public Path makeQualified(Path path) { + return 
convertToHoodiePath(fileSystem.makeQualified(convertToDefaultPath(path))); + } + + @Override public Token getDelegationToken(String renewer) throws IOException { + return fileSystem.getDelegationToken(renewer); + } + + @Override public Token[] addDelegationTokens(String renewer, Credentials credentials) + throws IOException { + return fileSystem.addDelegationTokens(renewer, credentials); + } + + @Override public FileSystem[] getChildFileSystems() { + return fileSystem.getChildFileSystems(); + } + + @Override public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) + throws IOException { + return fileSystem.getFileBlockLocations(file, start, len); + } + + @Override public BlockLocation[] getFileBlockLocations(Path p, long start, long len) + throws IOException { + return fileSystem.getFileBlockLocations(convertToDefaultPath(p), start, len); + } + + @Override public FsServerDefaults getServerDefaults() throws IOException { + return fileSystem.getServerDefaults(); + } + + @Override public FsServerDefaults getServerDefaults(Path p) throws IOException { + return fileSystem.getServerDefaults(convertToDefaultPath(p)); + } + + @Override public Path resolvePath(Path p) throws IOException { + return convertToHoodiePath(fileSystem.resolvePath(convertToDefaultPath(p))); + } + + @Override public FSDataInputStream open(Path f) throws IOException { + return fileSystem.open(convertToDefaultPath(f)); + } + + @Override + public FSDataOutputStream createNonRecursive(Path f, boolean overwrite, int bufferSize, + short replication, long blockSize, Progressable progress) throws IOException { + return fileSystem + .createNonRecursive(convertToDefaultPath(f), overwrite, bufferSize, replication, + blockSize, progress); + } + + @Override + public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, boolean overwrite, + int bufferSize, short replication, long blockSize, Progressable progress) + throws IOException { + return fileSystem + 
.createNonRecursive(convertToDefaultPath(f), permission, overwrite, bufferSize, + replication, blockSize, progress); + } + + @Override public FSDataOutputStream createNonRecursive(Path f, FsPermission permission, + EnumSet flags, int bufferSize, short replication, long blockSize, + Progressable progress) throws IOException { + return fileSystem + .createNonRecursive(convertToDefaultPath(f), permission, flags, bufferSize, replication, + blockSize, progress); + } + + @Override public boolean createNewFile(Path f) throws IOException { + return fileSystem.createNewFile(convertToDefaultPath(f)); + } + + @Override public FSDataOutputStream append(Path f) throws IOException { + return fileSystem.append(convertToDefaultPath(f)); + } + + @Override public FSDataOutputStream append(Path f, int bufferSize) throws IOException { + return fileSystem.append(convertToDefaultPath(f), bufferSize); + } + + @Override public void concat(Path trg, Path[] psrcs) throws IOException { + Path[] psrcsNew = convertDefaults(psrcs); + fileSystem.concat(convertToDefaultPath(trg), psrcsNew); + } + + @Override public short getReplication(Path src) throws IOException { + return fileSystem.getReplication(convertToDefaultPath(src)); + } + + @Override public boolean setReplication(Path src, short replication) throws IOException { + return fileSystem.setReplication(convertToDefaultPath(src), replication); + } + + @Override public boolean delete(Path f) throws IOException { + return fileSystem.delete(convertToDefaultPath(f)); + } + + @Override public boolean deleteOnExit(Path f) throws IOException { + return fileSystem.deleteOnExit(convertToDefaultPath(f)); + } + + @Override public boolean cancelDeleteOnExit(Path f) { + return fileSystem.cancelDeleteOnExit(convertToDefaultPath(f)); + } + + @Override public boolean exists(Path f) throws IOException { + return fileSystem.exists(convertToDefaultPath(f)); + } + + @Override public boolean isDirectory(Path f) throws IOException { + return 
fileSystem.isDirectory(convertToDefaultPath(f)); + } + + @Override public boolean isFile(Path f) throws IOException { + return fileSystem.isFile(convertToDefaultPath(f)); + } + + @Override public long getLength(Path f) throws IOException { + return fileSystem.getLength(convertToDefaultPath(f)); + } + + @Override public ContentSummary getContentSummary(Path f) throws IOException { + return fileSystem.getContentSummary(convertToDefaultPath(f)); + } + + @Override public RemoteIterator listCorruptFileBlocks(Path path) throws IOException { + return fileSystem.listCorruptFileBlocks(convertToDefaultPath(path)); + } + + @Override public FileStatus[] listStatus(Path f, PathFilter filter) + throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertToDefaultPath(f), filter); + } + + @Override public FileStatus[] listStatus(Path[] files) + throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertDefaults(files)); + } + + @Override public FileStatus[] listStatus(Path[] files, PathFilter filter) + throws FileNotFoundException, IOException { + return fileSystem.listStatus(convertDefaults(files), filter); + } + + @Override public FileStatus[] globStatus(Path pathPattern) throws IOException { + return fileSystem.globStatus(convertToDefaultPath(pathPattern)); + } + + @Override public FileStatus[] globStatus(Path pathPattern, PathFilter filter) + throws IOException { + return fileSystem.globStatus(convertToDefaultPath(pathPattern), filter); + } + + @Override public RemoteIterator listLocatedStatus(Path f) + throws FileNotFoundException, IOException { + return fileSystem.listLocatedStatus(convertToDefaultPath(f)); + } + + @Override public RemoteIterator listFiles(Path f, boolean recursive) + throws FileNotFoundException, IOException { + return fileSystem.listFiles(convertToDefaultPath(f), recursive); + } + + @Override public Path getHomeDirectory() { + return convertToHoodiePath(fileSystem.getHomeDirectory()); + } + + @Override public 
boolean mkdirs(Path f) throws IOException { + return fileSystem.mkdirs(convertToDefaultPath(f)); + } + + @Override public void copyFromLocalFile(Path src, Path dst) throws IOException { + fileSystem.copyFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override public void moveFromLocalFile(Path[] srcs, Path dst) throws IOException { + fileSystem.moveFromLocalFile(convertDefaults(srcs), convertToDefaultPath(dst)); + } + + @Override public void moveFromLocalFile(Path src, Path dst) throws IOException { + fileSystem.moveFromLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override public void copyFromLocalFile(boolean delSrc, Path src, Path dst) throws IOException { + fileSystem.copyFromLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path[] srcs, Path dst) + throws IOException { + fileSystem + .copyFromLocalFile(delSrc, overwrite, convertDefaults(srcs), convertToDefaultPath(dst)); + } + + @Override public void copyFromLocalFile(boolean delSrc, boolean overwrite, Path src, Path dst) + throws IOException { + fileSystem.copyFromLocalFile(delSrc, overwrite, convertToDefaultPath(src), + convertToDefaultPath(dst)); + } + + @Override public void copyToLocalFile(Path src, Path dst) throws IOException { + fileSystem.copyToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override public void moveToLocalFile(Path src, Path dst) throws IOException { + fileSystem.moveToLocalFile(convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override public void copyToLocalFile(boolean delSrc, Path src, Path dst) throws IOException { + fileSystem.copyToLocalFile(delSrc, convertToDefaultPath(src), convertToDefaultPath(dst)); + } + + @Override + public void copyToLocalFile(boolean delSrc, Path src, Path dst, boolean useRawLocalFileSystem) + throws IOException { + fileSystem.copyToLocalFile(delSrc, 
convertToDefaultPath(src), convertToDefaultPath(dst), + useRawLocalFileSystem); + } + + @Override public Path startLocalOutput(Path fsOutputFile, Path tmpLocalFile) + throws IOException { + return convertToHoodiePath(fileSystem.startLocalOutput(convertToDefaultPath(fsOutputFile), + convertToDefaultPath(tmpLocalFile))); + } + + @Override public void completeLocalOutput(Path fsOutputFile, Path tmpLocalFile) + throws IOException { + fileSystem.completeLocalOutput(convertToDefaultPath(fsOutputFile), + convertToDefaultPath(tmpLocalFile)); + } + + @Override public void close() throws IOException { + fileSystem.close(); + } + + @Override public long getUsed() throws IOException { + return fileSystem.getUsed(); + } + + @Override public long getBlockSize(Path f) throws IOException { + return fileSystem.getBlockSize(convertToDefaultPath(f)); + } + + @Override public long getDefaultBlockSize() { + return fileSystem.getDefaultBlockSize(); + } + + @Override public long getDefaultBlockSize(Path f) { + return fileSystem.getDefaultBlockSize(convertToDefaultPath(f)); + } + + @Override public short getDefaultReplication() { + return fileSystem.getDefaultReplication(); + } + + @Override public short getDefaultReplication(Path path) { + return fileSystem.getDefaultReplication(convertToDefaultPath(path)); + } + + @Override public void access(Path path, FsAction mode) + throws AccessControlException, FileNotFoundException, IOException { + fileSystem.access(convertToDefaultPath(path), mode); + } + + @Override public void createSymlink(Path target, Path link, boolean createParent) + throws AccessControlException, FileAlreadyExistsException, FileNotFoundException, + ParentNotDirectoryException, UnsupportedFileSystemException, IOException { + fileSystem + .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); + } + + @Override public FileStatus getFileLinkStatus(Path f) + throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException, + 
IOException { + return fileSystem.getFileLinkStatus(convertToDefaultPath(f)); + } + + @Override public boolean supportsSymlinks() { + return fileSystem.supportsSymlinks(); + } + + @Override public Path getLinkTarget(Path f) throws IOException { + return convertToHoodiePath(fileSystem.getLinkTarget(convertToDefaultPath(f))); + } + + @Override public FileChecksum getFileChecksum(Path f) throws IOException { + return fileSystem.getFileChecksum(convertToDefaultPath(f)); + } + + @Override public FileChecksum getFileChecksum(Path f, long length) throws IOException { + return fileSystem.getFileChecksum(convertToDefaultPath(f), length); + } + + @Override public void setVerifyChecksum(boolean verifyChecksum) { + fileSystem.setVerifyChecksum(verifyChecksum); + } + + @Override public void setWriteChecksum(boolean writeChecksum) { + fileSystem.setWriteChecksum(writeChecksum); + } + + @Override public FsStatus getStatus() throws IOException { + return fileSystem.getStatus(); + } + + @Override public FsStatus getStatus(Path p) throws IOException { + return fileSystem.getStatus(convertToDefaultPath(p)); + } + + @Override public void setPermission(Path p, FsPermission permission) throws IOException { + fileSystem.setPermission(convertToDefaultPath(p), permission); + } + + @Override public void setOwner(Path p, String username, String groupname) throws IOException { + fileSystem.setOwner(convertToDefaultPath(p), username, groupname); + } + + @Override public void setTimes(Path p, long mtime, long atime) throws IOException { + fileSystem.setTimes(convertToDefaultPath(p), mtime, atime); + } + + @Override public Path createSnapshot(Path path, String snapshotName) throws IOException { + return convertToHoodiePath( + fileSystem.createSnapshot(convertToDefaultPath(path), snapshotName)); + } + + @Override public void renameSnapshot(Path path, String snapshotOldName, String snapshotNewName) + throws IOException { + fileSystem.renameSnapshot(convertToDefaultPath(path), snapshotOldName, 
snapshotNewName); + } + + @Override public void deleteSnapshot(Path path, String snapshotName) throws IOException { + fileSystem.deleteSnapshot(convertToDefaultPath(path), snapshotName); + } + + @Override public void modifyAclEntries(Path path, List aclSpec) throws IOException { + fileSystem.modifyAclEntries(convertToDefaultPath(path), aclSpec); + } + + @Override public void removeAclEntries(Path path, List aclSpec) throws IOException { + fileSystem.removeAclEntries(convertToDefaultPath(path), aclSpec); + } + + @Override public void removeDefaultAcl(Path path) throws IOException { + fileSystem.removeDefaultAcl(convertToDefaultPath(path)); + } + + @Override public void removeAcl(Path path) throws IOException { + fileSystem.removeAcl(convertToDefaultPath(path)); + } + + @Override public void setAcl(Path path, List aclSpec) throws IOException { + fileSystem.setAcl(convertToDefaultPath(path), aclSpec); + } + + @Override public AclStatus getAclStatus(Path path) throws IOException { + return fileSystem.getAclStatus(convertToDefaultPath(path)); + } + + @Override public void setXAttr(Path path, String name, byte[] value) throws IOException { + fileSystem.setXAttr(convertToDefaultPath(path), name, value); + } + + @Override public void setXAttr(Path path, String name, byte[] value, EnumSet flag) + throws IOException { + fileSystem.setXAttr(convertToDefaultPath(path), name, value, flag); + } + + @Override public byte[] getXAttr(Path path, String name) throws IOException { + return fileSystem.getXAttr(convertToDefaultPath(path), name); + } + + @Override public Map getXAttrs(Path path) throws IOException { + return fileSystem.getXAttrs(convertToDefaultPath(path)); + } + + @Override public Map getXAttrs(Path path, List names) + throws IOException { + return fileSystem.getXAttrs(convertToDefaultPath(path), names); + } + + @Override public List listXAttrs(Path path) throws IOException { + return fileSystem.listXAttrs(convertToDefaultPath(path)); + } + + @Override public void 
removeXAttr(Path path, String name) throws IOException { + fileSystem.removeXAttr(convertToDefaultPath(path), name); + } + + @Override public void setConf(Configuration conf) { + // ignore this. we will set conf on init + } + + @Override public Configuration getConf() { + return fileSystem.getConf(); + } + + @Override public int hashCode() { + return fileSystem.hashCode(); + } + + @Override public boolean equals(Object obj) { + return fileSystem.equals(obj); + } + + @Override public String toString() { + return fileSystem.toString(); + } + + public Path convertToHoodiePath(Path oldPath) { + return convertPathWithScheme(oldPath, getHoodieScheme(fileSystem.getScheme())); + } + + public static Path convertToHoodiePath(Path file, Configuration conf) { + String scheme = FileSystem.getDefaultUri(conf).getScheme(); + return convertPathWithScheme(file, getHoodieScheme(scheme)); + } + + private Path convertToDefaultPath(Path oldPath) { + return convertPathWithScheme(oldPath, fileSystem.getScheme()); + } + + private Path[] convertDefaults(Path[] psrcs) { + Path[] psrcsNew = new Path[psrcs.length]; + for (int i = 0; i < psrcs.length; i++) { + psrcsNew[i] = convertToDefaultPath(psrcs[i]); + } + return psrcsNew; + } + + private static Path convertPathWithScheme(Path oldPath, String newScheme) { + URI oldURI = oldPath.toUri(); + URI newURI; + try { + newURI = new URI(newScheme, oldURI.getUserInfo(), oldURI.getHost(), oldURI.getPort(), + oldURI.getPath(), oldURI.getQuery(), oldURI.getFragment()); + return new Path(newURI); + } catch (URISyntaxException e) { + // TODO - Better Exception handling + throw new RuntimeException(e); + } + } + + public static String getHoodieScheme(String scheme) { + String newScheme; + if (SUPPORT_SCHEMES.contains(scheme)) { + newScheme = HOODIE_SCHEME_PREFIX + scheme; + } else { + throw new IllegalArgumentException( + "BlockAlignedAvroParquetWriter does not support schema " + scheme); + } + return newScheme; + } + + public long getBytesWritten(Path 
file) { + if (openStreams.containsKey(file.getName())) { + return openStreams.get(file.getName()).getBytesWritten(); + } + // When the file is first written, we do not have a track of it + throw new IllegalArgumentException(file.toString() + + " does not have an open stream. Cannot get the bytes written on the stream"); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/SizeAwareFSDataOutputStream.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/SizeAwareFSDataOutputStream.java new file mode 100644 index 000000000..1c4dd9ae5 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/SizeAwareFSDataOutputStream.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.io.storage; + +import org.apache.hadoop.fs.FSDataOutputStream; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Wrapper over FSDataOutputStream to keep track of the size of the written bytes. + * This gives a cheap way to check on the underlying file size. + */ +public class SizeAwareFSDataOutputStream extends FSDataOutputStream { + // A callback to call when the output stream is closed. 
+ private final Runnable closeCallback; + // Keep track of the bytes written + private final AtomicLong bytesWritten = new AtomicLong(0L); + + public SizeAwareFSDataOutputStream(FSDataOutputStream out, Runnable closeCallback) + throws IOException { + super(out); + this.closeCallback = closeCallback; + } + + @Override public synchronized void write(byte[] b, int off, int len) throws IOException { + bytesWritten.addAndGet(len); + super.write(b, off, len); + } + + @Override public void write(byte[] b) throws IOException { + bytesWritten.addAndGet(b.length); + super.write(b); + } + + @Override public void close() throws IOException { + super.close(); + closeCallback.run(); + } + + public long getBytesWritten() { + return bytesWritten.get(); + } + +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java new file mode 100644 index 000000000..066064119 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/HoodieMetrics.java @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.metrics; + +import com.codahale.metrics.Gauge; +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.Timer; +import com.google.common.annotations.VisibleForTesting; +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.config.HoodieWriteConfig; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +/** + * Wrapper for metrics-related operations. + */ +public class HoodieMetrics { + private HoodieWriteConfig config = null; + private String tableName = null; + private static Logger logger = LogManager.getLogger(HoodieMetrics.class); + // Some timers + public String rollbackTimerName = null; + public String cleanTimerName = null; + public String commitTimerName = null; + private Timer rollbackTimer = null; + private Timer cleanTimer = null; + private Timer commitTimer = null; + + public HoodieMetrics(HoodieWriteConfig config, String tableName) { + this.config = config; + this.tableName = tableName; + if (config.isMetricsOn()) { + Metrics.init(config); + this.rollbackTimerName = getMetricsName("timer", "rollback"); + this.cleanTimerName = getMetricsName("timer", "clean"); + this.commitTimerName = getMetricsName("timer", "commit"); + } + } + + private Timer createTimer(String name) { + return config.isMetricsOn() ? Metrics.getInstance().getRegistry().timer(name) : null; + } + + public Timer.Context getRollbackCtx() { + if (config.isMetricsOn() && rollbackTimer == null) { + rollbackTimer = createTimer(rollbackTimerName); + } + return rollbackTimer == null ? null : rollbackTimer.time(); + } + + public Timer.Context getCleanCtx() { + if (config.isMetricsOn() && cleanTimer == null) { + cleanTimer = createTimer(cleanTimerName); + } + return cleanTimer == null ? null : cleanTimer.time(); + } + + public Timer.Context getCommitCtx() { + if (config.isMetricsOn() && commitTimer == null) { + commitTimer = createTimer(commitTimerName); + } + return commitTimer == null ? 
null : commitTimer.time(); + } + + public void updateCommitMetrics(long commitEpochTimeInMs, long durationInMs, HoodieCommitMetadata metadata) { + if (config.isMetricsOn()) { + long totalPartitionsWritten = metadata.fetchTotalPartitionsWritten(); + long totalFilesInsert = metadata.fetchTotalFilesInsert(); + long totalFilesUpdate = metadata.fetchTotalFilesUpdated(); + long totalRecordsWritten = metadata.fetchTotalRecordsWritten(); + long totalUpdateRecordsWritten = metadata.fetchTotalUpdateRecordsWritten(); + long totalInsertRecordsWritten = metadata.fetchTotalInsertRecordsWritten(); + long totalBytesWritten = metadata.fetchTotalBytesWritten(); + registerGauge(getMetricsName("commit", "duration"), durationInMs); + registerGauge(getMetricsName("commit", "totalPartitionsWritten"), totalPartitionsWritten); + registerGauge(getMetricsName("commit", "totalFilesInsert"), totalFilesInsert); + registerGauge(getMetricsName("commit", "totalFilesUpdate"), totalFilesUpdate); + registerGauge(getMetricsName("commit", "totalRecordsWritten"), totalRecordsWritten); + registerGauge(getMetricsName("commit", "totalUpdateRecordsWritten"), totalUpdateRecordsWritten); + registerGauge(getMetricsName("commit", "totalInsertRecordsWritten"), totalInsertRecordsWritten); + registerGauge(getMetricsName("commit", "totalBytesWritten"), totalBytesWritten); + registerGauge(getMetricsName("commit", "commitTime"), commitEpochTimeInMs); + } + } + + public void updateRollbackMetrics(long durationInMs, int numFilesDeleted) { + if (config.isMetricsOn()) { + logger.info(String.format("Sending rollback metrics (duration=%d, numFilesDeleted=%d)", + durationInMs, numFilesDeleted)); + registerGauge(getMetricsName("rollback", "duration"), durationInMs); + registerGauge(getMetricsName("rollback", "numFilesDeleted"), numFilesDeleted); + } + } + + public void updateCleanMetrics(long durationInMs, int numFilesDeleted) { + if (config.isMetricsOn()) { + logger.info(String.format("Sending clean metrics (duration=%d, 
numFilesDeleted=%d)", + durationInMs, numFilesDeleted)); + registerGauge(getMetricsName("clean", "duration"), durationInMs); + registerGauge(getMetricsName("clean", "numFilesDeleted"), numFilesDeleted); + } + } + + @VisibleForTesting + String getMetricsName(String action, String metric) { + return config == null ? null : + String.format("%s.%s.%s", tableName, action, metric); + } + + void registerGauge(String metricName, final long value) { + try { + MetricRegistry registry = Metrics.getInstance().getRegistry(); + registry.register(metricName, new Gauge() { + @Override + public Long getValue() { + return value; + } + }); + } catch (Exception e) { + // Here we catch all exception, so the major upsert pipeline will not be affected if the metrics system + // has some issues. + logger.error("Failed to send metrics: ", e); + } + } + + /** + * By default, the timer context returns duration with nano seconds. + * Convert it to millisecond. + */ + public long getDurationInMs(long ctxDuration) { + return ctxDuration / 1000000; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/InMemoryMetricsReporter.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/InMemoryMetricsReporter.java new file mode 100644 index 000000000..e3511b523 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/InMemoryMetricsReporter.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.metrics; + +import java.io.Closeable; + +/** + * Used for testing. + */ +public class InMemoryMetricsReporter extends MetricsReporter { + @Override + public void start() { + } + + @Override + public void report() { + } + + @Override + public Closeable getReporter() { + return null; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java new file mode 100644 index 000000000..4158814c6 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/Metrics.java @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.metrics; + +import com.codahale.metrics.MetricRegistry; +import com.google.common.io.Closeables; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.config.HoodieMetricsConfig; +import com.uber.hoodie.exception.HoodieException; +import org.apache.commons.configuration.ConfigurationException; + +import java.io.Closeable; + +/** + * This is the main class of the metrics system. To use it, + * users need to call the {@link #init(HoodieMetricsConfig) init} method to initialize the system. 
+ * Input for {@link #init(HoodieMetricsConfig) init} includes a configuration object, where + * users can specify the reporter type, and special configs for that reporter. + * Refer to {@see MetricsConfiguration} for more configurable fields. + */ +public class Metrics { + private static volatile boolean initialized = false; + private static Metrics metrics = null; + private final MetricRegistry registry; + private MetricsReporter reporter = null; + + private Metrics(HoodieWriteConfig metricConfig) throws ConfigurationException { + registry = new MetricRegistry(); + + reporter = MetricsReporterFactory.createReporter(metricConfig, registry); + if (reporter == null) { + throw new RuntimeException("Cannot initialize Reporter."); + } +// reporter.start(); + + Runtime.getRuntime().addShutdownHook(new Thread() { + @Override + public void run() { + try { + reporter.report(); + Closeables.close(reporter.getReporter(), true); + } catch (Exception e) { + e.printStackTrace(); + } + } + }); + } + + public static Metrics getInstance() { + assert initialized; + return metrics; + } + + public static synchronized void init(HoodieWriteConfig metricConfig) { + if (initialized) { + return; + } + try { + metrics = new Metrics(metricConfig); + } catch (ConfigurationException e) { + throw new HoodieException(e); + } + initialized = true; + } + + public MetricRegistry getRegistry() { + return registry; + } + + public Closeable getReporter() { + return reporter.getReporter(); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsGraphiteReporter.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsGraphiteReporter.java new file mode 100644 index 000000000..aeb5464d4 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsGraphiteReporter.java @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. 
(hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.metrics; + +import com.codahale.metrics.MetricFilter; +import com.codahale.metrics.MetricRegistry; +import com.codahale.metrics.graphite.Graphite; +import com.codahale.metrics.graphite.GraphiteReporter; +import com.uber.hoodie.config.HoodieWriteConfig; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.Closeable; +import java.net.InetSocketAddress; +import java.util.concurrent.TimeUnit; + +/** + * Implementation of Graphite reporter, which connects to the Graphite server, + * and send metrics to that server. 
+ */ +public class MetricsGraphiteReporter extends MetricsReporter { + private final MetricRegistry registry; + private final GraphiteReporter graphiteReporter; + private final HoodieWriteConfig config; + private String serverHost; + private int serverPort; + + private static Logger logger = LogManager.getLogger(MetricsGraphiteReporter.class); + + public MetricsGraphiteReporter(HoodieWriteConfig config, MetricRegistry registry) { + this.registry = registry; + this.config = config; + + // Check the serverHost and serverPort here + this.serverHost = config.getGraphiteServerHost(); + this.serverPort = config.getGraphiteServerPort(); + if (serverHost == null || serverPort == 0) { + throw new RuntimeException( + String.format("Graphite cannot be initialized with serverHost[%s] and serverPort[%s].", + serverHost, serverPort)); + } + + this.graphiteReporter = createGraphiteReport(); + } + + @Override + public void start() { + if (graphiteReporter != null) { + graphiteReporter.start(30, TimeUnit.SECONDS); + } else { + logger.error("Cannot start as the graphiteReporter is null."); + } + } + + @Override + public void report() { + if (graphiteReporter != null) { + graphiteReporter.report(); + } else { + logger.error("Cannot report metrics as the graphiteReporter is null."); + } + } + + @Override + public Closeable getReporter() { + return graphiteReporter; + } + + private GraphiteReporter createGraphiteReport() { + Graphite graphite = new Graphite( + new InetSocketAddress(serverHost, serverPort)); + String reporterPrefix = config.getGraphiteMetricPrefix(); + return GraphiteReporter.forRegistry(registry) + .prefixedWith(reporterPrefix) + .convertRatesTo(TimeUnit.SECONDS) + .convertDurationsTo(TimeUnit.MILLISECONDS) + .filter(MetricFilter.ALL) + .build(graphite); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporter.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporter.java new file mode 100644 index 000000000..719e7c6a3 
--- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporter.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.metrics; + +import java.io.Closeable; + +/** + * Interface for implementing a Reporter. + */ +public abstract class MetricsReporter { + /** + * Push out metrics at scheduled intervals + */ + public abstract void start(); + + /** + * Deterministically push out metrics + */ + public abstract void report(); + + public abstract Closeable getReporter(); +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterFactory.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterFactory.java new file mode 100644 index 000000000..3c0d9e667 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterFactory.java @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.metrics; + +import com.codahale.metrics.MetricRegistry; +import com.uber.hoodie.config.HoodieWriteConfig; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +/** + * Factory class for creating MetricsReporter. + */ +public class MetricsReporterFactory { + private static Logger logger = LogManager.getLogger(MetricsReporterFactory.class); + + public static MetricsReporter createReporter(HoodieWriteConfig config, + MetricRegistry registry) { + MetricsReporterType type = config.getMetricsReporterType(); + MetricsReporter reporter = null; + switch (type) { + case GRAPHITE: + reporter = new MetricsGraphiteReporter(config, registry); + break; + case INMEMORY: + reporter = new InMemoryMetricsReporter(); + break; + default: + logger.error("Reporter type[" + type + "] is not supported."); + break; + } + return reporter; + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterType.java b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterType.java new file mode 100644 index 000000000..cac162cec --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/metrics/MetricsReporterType.java @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.metrics; + +/** + * Types of the reporter. Right now we only support Graphite. + * We can include JMX and CSV in the future. + */ +public enum MetricsReporterType { + GRAPHITE, + INMEMORY +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java new file mode 100644 index 000000000..43455d208 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.table; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieUpsertException; +import com.uber.hoodie.func.LazyInsertIterable; +import com.uber.hoodie.io.HoodieUpdateHandle; + +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.avro.AvroParquetReader; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.spark.Partitioner; + +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +import scala.Option; +import scala.Tuple2; + +/** + * Implementation of a very heavily read-optimized Hoodie Table where + * + * INSERTS - Produce new files, block aligned to desired size (or) + * Merge with the smallest existing file, to expand it + * + * UPDATES - Produce a new version of the file containing the invalidated records + * + */ +public class HoodieCopyOnWriteTable extends HoodieTable { + + // seed for random number generator. 
No particular significance, just makes testing deterministic + private static final long RANDOM_NUMBER_SEED = 356374L; + + + private static Logger logger = LogManager.getLogger(HoodieCopyOnWriteTable.class); + + enum BucketType { + UPDATE, + INSERT + } + + /** + * Helper class for a small file's location and its actual size on disk + */ + class SmallFile implements Serializable { + HoodieRecordLocation location; + long sizeBytes; + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("SmallFile {"); + sb.append("location=").append(location).append(", "); + sb.append("sizeBytes=").append(sizeBytes); + sb.append('}'); + return sb.toString(); + } + } + + /** + * Helper class for an insert bucket along with the weight [0.0, 0.1] + * that defines the amount of incoming inserts that should be allocated to + * the bucket + */ + class InsertBucket implements Serializable { + int bucketNumber; + // fraction of total inserts, that should go into this bucket + double weight; + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WorkloadStat {"); + sb.append("bucketNumber=").append(bucketNumber).append(", "); + sb.append("weight=").append(weight); + sb.append('}'); + return sb.toString(); + } + } + + /** + * Helper class for a bucket's type (INSERT & UPDATE) and its file location + */ + class BucketInfo implements Serializable { + BucketType bucketType; + String fileLoc; + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("BucketInfo {"); + sb.append("bucketType=").append(bucketType).append(", "); + sb.append("fileLoc=").append(fileLoc); + sb.append('}'); + return sb.toString(); + } + } + + + public HoodieCopyOnWriteTable(String commitTime, HoodieWriteConfig config, HoodieTableMetadata metadata) { + super(commitTime, config, metadata); + } + + /** + * Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition) + */ + class UpsertPartitioner extends 
Partitioner { + + /** + * Total number of RDD partitions, is determined by total buckets we want to + * pack the incoming workload into + */ + private int totalBuckets = 0; + + /** + * Helps decide which bucket an incoming update should go to. + */ + private HashMap updateLocationToBucket; + + + /** + * Helps us pack inserts into 1 or more buckets depending on number of + * incoming records. + */ + private HashMap> partitionPathToInsertBuckets; + + + /** + * Remembers what type each bucket is for later. + */ + private HashMap bucketInfoMap; + + + /** + * Random number generator to use for splitting inserts into buckets by weight + */ + private Random rand = new Random(RANDOM_NUMBER_SEED); + + + UpsertPartitioner(WorkloadProfile profile) { + updateLocationToBucket = new HashMap<>(); + partitionPathToInsertBuckets = new HashMap<>(); + bucketInfoMap = new HashMap<>(); + + assignUpdates(profile); + assignInserts(profile); + + logger.info("Total Buckets :" + totalBuckets + ", " + + "buckets info => " + bucketInfoMap + ", \n" + + "Partition to insert buckets => " + partitionPathToInsertBuckets + ", \n" + + "UpdateLocations mapped to buckets =>" + updateLocationToBucket); + } + + private void assignUpdates(WorkloadProfile profile) { + // each update location gets a partition + WorkloadStat gStat = profile.getGlobalStat(); + for (Map.Entry updateLocEntry : gStat.getUpdateLocationToCount().entrySet()) { + addUpdateBucket(updateLocEntry.getKey()); + } + } + + private int addUpdateBucket(String fileLoc) { + int bucket = totalBuckets; + updateLocationToBucket.put(fileLoc, bucket); + BucketInfo bucketInfo = new BucketInfo(); + bucketInfo.bucketType = BucketType.UPDATE; + bucketInfo.fileLoc = fileLoc; + bucketInfoMap.put(totalBuckets, bucketInfo); + totalBuckets++; + return bucket; + } + + private void assignInserts(WorkloadProfile profile) { + // for new inserts, compute buckets depending on how many records we have for each partition + Set partitionPaths = 
profile.getPartitionPaths(); + long averageRecordSize = averageBytesPerRecord(); + logger.info("AvgRecordSize => " + averageRecordSize); + for (String partitionPath : partitionPaths) { + WorkloadStat pStat = profile.getWorkloadStat(partitionPath); + if (pStat.getNumInserts() > 0) { + + List smallFiles = getSmallFiles(partitionPath); + logger.info("For partitionPath : "+ partitionPath + " Small Files => " + smallFiles); + + long totalUnassignedInserts = pStat.getNumInserts(); + List bucketNumbers = new ArrayList<>(); + List recordsPerBucket = new ArrayList<>(); + + // first try packing this into one of the smallFiles + for (SmallFile smallFile: smallFiles) { + long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes)/ averageRecordSize, totalUnassignedInserts); + if (recordsToAppend > 0 && totalUnassignedInserts > 0){ + // create a new bucket or re-use an existing bucket + int bucket; + if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) { + bucket = updateLocationToBucket.get(smallFile.location.getFileId()); + logger.info("Assigning " + recordsToAppend + " inserts to existing update bucket "+ bucket); + } else { + bucket = addUpdateBucket(smallFile.location.getFileId()); + logger.info("Assigning " + recordsToAppend + " inserts to new update bucket "+ bucket); + } + bucketNumbers.add(bucket); + recordsPerBucket.add(recordsToAppend); + totalUnassignedInserts -= recordsToAppend; + } + } + + // if we have anything more, create new insert buckets, like normal + if (totalUnassignedInserts > 0) { + long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize(); + if (config.shouldAutoTuneInsertSplits()) { + insertRecordsPerBucket = config.getParquetMaxFileSize()/averageRecordSize; + } + + int insertBuckets = (int) Math.max(totalUnassignedInserts / insertRecordsPerBucket, 1L); + logger.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts + + ", totalInsertBuckets => " + insertBuckets + + 
", recordsPerBucket => " + insertRecordsPerBucket); + for (int b = 0; b < insertBuckets; b++) { + bucketNumbers.add(totalBuckets); + recordsPerBucket.add(totalUnassignedInserts/insertBuckets); + BucketInfo bucketInfo = new BucketInfo(); + bucketInfo.bucketType = BucketType.INSERT; + bucketInfoMap.put(totalBuckets, bucketInfo); + totalBuckets++; + } + } + + // Go over all such buckets, and assign weights as per amount of incoming inserts. + List insertBuckets = new ArrayList<>(); + for (int i = 0; i < bucketNumbers.size(); i++) { + InsertBucket bkt = new InsertBucket(); + bkt.bucketNumber = bucketNumbers.get(i); + bkt.weight = (1.0 * recordsPerBucket.get(i))/pStat.getNumInserts(); + insertBuckets.add(bkt); + } + logger.info("Total insert buckets for partition path "+ partitionPath + " => " + insertBuckets); + partitionPathToInsertBuckets.put(partitionPath, insertBuckets); + } + } + } + + + /** + * Returns a list of small files in the given partition path + * + * @param partitionPath + * @return + */ + private List getSmallFiles(String partitionPath) { + FileSystem fs = FSUtils.getFs(); + List smallFileLocations = new ArrayList<>(); + + if (metadata.getAllCommits().getNumCommits() > 0) { // if we have some commits + String latestCommitTime = metadata.getAllCommits().lastCommit(); + FileStatus[] allFiles = metadata.getLatestVersionInPartition(fs, partitionPath, latestCommitTime); + + if (allFiles != null && allFiles.length > 0) { + for (FileStatus fileStatus : allFiles) { + if (fileStatus.getLen() < config.getParquetSmallFileLimit()) { + String filename = fileStatus.getPath().getName(); + SmallFile sf = new SmallFile(); + sf.location = new HoodieRecordLocation( + FSUtils.getCommitTime(filename), + FSUtils.getFileId(filename)); + sf.sizeBytes = fileStatus.getLen(); + smallFileLocations.add(sf); + } + } + } + } + + return smallFileLocations; + } + + /** + * Obtains the average record size based on records written during last commit. 
+ * Used for estimating how many records pack into one file. + * + * @return + */ + private long averageBytesPerRecord() { + long avgSize = 0L; + try { + if (metadata.getAllCommits().getNumCommits() > 0) { + String latestCommitTime = metadata.getAllCommits().lastCommit(); + HoodieCommitMetadata commitMetadata = metadata.getCommitMetadata(latestCommitTime); + avgSize =(long) Math.ceil((1.0 * commitMetadata.fetchTotalBytesWritten())/commitMetadata.fetchTotalRecordsWritten()); + } + } catch (Throwable t) { + // make this fail safe. + logger.error("Error trying to compute average bytes/record ", t); + } + return avgSize <= 0L ? config.getCopyOnWriteRecordSizeEstimate() : avgSize; + } + + public BucketInfo getBucketInfo(int bucketNumber) { + return bucketInfoMap.get(bucketNumber); + } + + public List getInsertBuckets(String partitionPath) { + return partitionPathToInsertBuckets.get(partitionPath); + } + + @Override + public int numPartitions() { + return totalBuckets; + } + + @Override + public int getPartition(Object key) { + Tuple2> keyLocation = (Tuple2>) key; + if (keyLocation._2().isDefined()) { + HoodieRecordLocation location = keyLocation._2().get(); + return updateLocationToBucket.get(location.getFileId()); + } else { + List targetBuckets = partitionPathToInsertBuckets.get(keyLocation._1().getPartitionPath()); + // pick the target bucket to use based on the weights. 
+ double totalWeight = 0.0; + double r = rand.nextDouble(); + for (InsertBucket insertBucket: targetBuckets) { + totalWeight += insertBucket.weight; + if (r <= totalWeight) { + return insertBucket.bucketNumber; + } + } + // return first one, by default + return targetBuckets.get(0).bucketNumber; + } + } + } + + + @Override + public Partitioner getUpsertPartitioner(WorkloadProfile profile) { + if (profile == null) { + throw new HoodieUpsertException("Need workload profile to construct the upsert partitioner."); + } + return new UpsertPartitioner(profile); + } + + @Override + public Partitioner getInsertPartitioner(WorkloadProfile profile) { + return null; + } + + @Override + public boolean isWorkloadProfileNeeded() { + return true; + } + + + + public Iterator> handleUpdate(String fileLoc, Iterator> recordItr) throws Exception { + // these are updates + HoodieUpdateHandle upsertHandle = + new HoodieUpdateHandle<>(config, commitTime, metadata, recordItr, fileLoc); + if (upsertHandle.getOldFilePath() == null) { + logger.error("Error in finding the old file path at commit " + commitTime); + } else { + Configuration conf = FSUtils.getFs().getConf(); + AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema()); + ParquetReader reader = + AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(conf).build(); + try { + IndexedRecord record; + while ((record = reader.read()) != null) { + // Two types of writes here (new record, and old record). + // We have already catch the exception during writing new records. + // But for old records, we should fail if any exception happens. 
+ upsertHandle.write((GenericRecord) record); + } + } catch (IOException e) { + throw new HoodieUpsertException( + "Failed to read record from " + upsertHandle.getOldFilePath() + + " with new Schema " + upsertHandle.getSchema(), e); + } finally { + reader.close(); + upsertHandle.close(); + } + } + if (upsertHandle.getWriteStatus().getPartitionPath() == null) { + logger.info( + "Upsert Handle has partition path as null " + upsertHandle.getOldFilePath() + + ", " + upsertHandle.getWriteStatus()); + } + return Collections.singletonList(Collections.singletonList(upsertHandle.getWriteStatus())).iterator(); + } + + public Iterator> handleInsert(Iterator> recordItr) throws Exception { + return new LazyInsertIterable<>(recordItr, config, commitTime, metadata); + } + + + @Override + public Iterator> handleUpsertPartition(Integer partition, + Iterator recordItr, + Partitioner partitioner) { + UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner; + BucketInfo binfo = upsertPartitioner.getBucketInfo(partition); + BucketType btype = binfo.bucketType; + try { + if (btype.equals(BucketType.INSERT)) { + return handleInsert(recordItr); + } else if (btype.equals(BucketType.UPDATE)) { + return handleUpdate(binfo.fileLoc, recordItr); + } else { + throw new HoodieUpsertException("Unknown bucketType " + btype + " for partition :" + partition); + } + } catch (Throwable t) { + String msg = "Error upserting bucketType " + btype + " for partition :" + partition; + logger.error(msg, t); + throw new HoodieUpsertException(msg, t); + } + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java new file mode 100644 index 000000000..9bf2f59fe --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. 
(hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.table; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieTableType; +import com.uber.hoodie.exception.HoodieException; + +import org.apache.spark.Partitioner; + +import java.io.Serializable; +import java.util.Iterator; +import java.util.List; + +/** + * Abstract implementation of a HoodieTable + */ +public abstract class HoodieTable implements Serializable { + + protected final String commitTime; + + protected final HoodieWriteConfig config; + + protected final HoodieTableMetadata metadata; + + protected HoodieTable(String commitTime, HoodieWriteConfig config, HoodieTableMetadata metadata) { + this.commitTime = commitTime; + this.config = config; + this.metadata = metadata; + } + + /** + * Provides a partitioner to perform the upsert operation, based on the + * workload profile + * + * @return + */ + public abstract Partitioner getUpsertPartitioner(WorkloadProfile profile); + + + /** + * Provides a partitioner to perform the insert operation, based on the workload profile + * + * @return + */ + public abstract Partitioner getInsertPartitioner(WorkloadProfile profile); + + + /** + * Return whether this 
HoodieTable implementation can benefit from workload + * profiling + * + * @return + */ + public abstract boolean isWorkloadProfileNeeded(); + + + /** + * Perform the ultimate IO for a given upserted (RDD) partition + * + * @param partition + * @param recordIterator + * @param partitioner + */ + public abstract Iterator> handleUpsertPartition(Integer partition, + Iterator> recordIterator, + Partitioner partitioner); + + + public static HoodieTable getHoodieTable(HoodieTableType type, + String commitTime, + HoodieWriteConfig config, + HoodieTableMetadata metadata) { + if (type == HoodieTableType.COPY_ON_WRITE) { + return new HoodieCopyOnWriteTable(commitTime, config, metadata); + } else { + throw new HoodieException("Unsupported table type :"+ type); + } + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadProfile.java b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadProfile.java new file mode 100644 index 000000000..21ca4eaf2 --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadProfile.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.table; + + +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieRecordPayload; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.PairFunction; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import scala.Option; +import scala.Tuple2; + +/** + * Information about incoming records for upsert/insert obtained either via sampling or + * introspecting the data fully + * + * TODO(vc): Think about obtaining this directly from index.tagLocation + */ +public class WorkloadProfile implements Serializable { + + /** + * Input workload + */ + private final JavaRDD> taggedRecords; + + /** + * Computed workload profile + */ + private final HashMap partitionPathStatMap; + + + private final WorkloadStat globalStat; + + + public WorkloadProfile(JavaRDD> taggedRecords) { + this.taggedRecords = taggedRecords; + this.partitionPathStatMap = new HashMap<>(); + this.globalStat = new WorkloadStat(); + buildProfile(); + } + + private void buildProfile() { + + Map>, Object> partitionLocationCounts = + taggedRecords.mapToPair(new PairFunction, Tuple2>, HoodieRecord>() { + @Override + public Tuple2>, HoodieRecord> call(HoodieRecord record) throws Exception { + return new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.apply(record.getCurrentLocation())), record); + } + }).countByKey(); + + for (Map.Entry>, Object> e: partitionLocationCounts.entrySet()) { + String partitionPath = e.getKey()._1(); + Long count = (Long) e.getValue(); + Option locOption = e.getKey()._2(); + + if (!partitionPathStatMap.containsKey(partitionPath)){ + partitionPathStatMap.put(partitionPath, new WorkloadStat()); + } + + if (locOption.isDefined()) { + // update + partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count); + globalStat.addUpdates(locOption.get(), count); 
+ } else { + // insert + partitionPathStatMap.get(partitionPath).addInserts(count); + globalStat.addInserts(count); + } + } + } + + public WorkloadStat getGlobalStat() { + return globalStat; + } + + public Set getPartitionPaths() { + return partitionPathStatMap.keySet(); + } + + public WorkloadStat getWorkloadStat(String partitionPath){ + return partitionPathStatMap.get(partitionPath); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WorkloadProfile {"); + sb.append("globalStat=").append(globalStat).append(", "); + sb.append("partitionStat=").append(partitionPathStatMap); + sb.append('}'); + return sb.toString(); + } +} diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadStat.java b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadStat.java new file mode 100644 index 000000000..a0eea477a --- /dev/null +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/WorkloadStat.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.table; + +import com.uber.hoodie.common.model.HoodieRecordLocation; + +import java.io.Serializable; +import java.util.HashMap; + +/** + * Wraps stats about a single partition path. 
+ */ +public class WorkloadStat implements Serializable { + private long numInserts = 0L; + + private long numUpdates = 0L; + + private HashMap updateLocationToCount; + + public WorkloadStat() { + updateLocationToCount = new HashMap<>(); + } + + long addInserts(long numInserts) { + return this.numInserts += numInserts; + } + + long addUpdates(HoodieRecordLocation location, long numUpdates) { + updateLocationToCount.put(location.getFileId(), numUpdates); + return this.numUpdates += numUpdates; + } + + public long getNumUpdates() { + return numUpdates; + } + + public long getNumInserts() { + return numInserts; + } + + public HashMap getUpdateLocationToCount() { + return updateLocationToCount; + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("WorkloadStat {"); + sb.append("numInserts=").append(numInserts).append(", "); + sb.append("numUpdates=").append(numUpdates); + sb.append('}'); + return sb.toString(); + } +} diff --git a/hoodie-client/src/main/resources/log4j.properties b/hoodie-client/src/main/resources/log4j.properties new file mode 100644 index 000000000..5a8b643fd --- /dev/null +++ b/hoodie-client/src/main/resources/log4j.properties @@ -0,0 +1,23 @@ +# +# Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 +# A1 is set to be a ConsoleAppender. 
+log4j.appender.A1=org.apache.log4j.ConsoleAppender +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hoodie-client/src/test/java/HoodieClientExample.java b/hoodie-client/src/test/java/HoodieClientExample.java new file mode 100644 index 000000000..be7d3ad87 --- /dev/null +++ b/hoodie-client/src/test/java/HoodieClientExample.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import com.uber.hoodie.HoodieWriteClient; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.common.HoodieTestDataGenerator; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.config.HoodieIndexConfig; +import com.uber.hoodie.index.HoodieIndex; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import java.util.List; + +/** + * Driver program that uses the Hoodie client with synthetic workload, and performs basic + * operations.

+ */ +public class HoodieClientExample { + + + private static Logger logger = LogManager.getLogger(HoodieClientExample.class); + + public static void main(String[] args) throws Exception { + + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + SparkConf sparkConf = new SparkConf().setAppName("hoodie-client-example"); + sparkConf.setMaster("local[1]"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.kryoserializer.buffer.max", "512m"); + JavaSparkContext jsc = new JavaSparkContext(sparkConf); + + // generate some records to be loaded in. + HoodieWriteConfig cfg = + HoodieWriteConfig.newBuilder().withPath("file:///tmp/hoodie/sample-table") + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("sample-table").withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + logger.info("Starting commit " + newCommitTime); + + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + client.upsert(writeRecords, newCommitTime); + + /** + * Write 2 (updates) + */ + newCommitTime = "002"; + logger.info("Starting commit " + newCommitTime); + records.addAll(dataGen.generateUpdates(newCommitTime, 100)); + + writeRecords = jsc.parallelize(records, 1); + client.upsert(writeRecords, newCommitTime); + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java new file mode 100644 index 000000000..7b84abe72 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClient.java @@ -0,0 +1,634 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. 
(hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie; + +import com.google.common.collect.Iterables; + +import com.uber.hoodie.common.HoodieClientTestUtils; +import com.uber.hoodie.common.HoodieTestDataGenerator; +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.model.HoodieCommits; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.model.HoodieWriteStat; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.ParquetUtils; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.config.HoodieCompactionConfig; +import com.uber.hoodie.config.HoodieIndexConfig; +import com.uber.hoodie.config.HoodieStorageConfig; +import com.uber.hoodie.exception.HoodieRollbackException; +import com.uber.hoodie.index.HoodieIndex; +import com.uber.hoodie.io.HoodieCleaner; + +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SQLContext; +import org.junit.After; +import org.junit.Before; 
+import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeSet; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class TestHoodieClient implements Serializable { + private transient JavaSparkContext jsc = null; + private transient SQLContext sqlContext; + private String basePath = null; + private transient HoodieTestDataGenerator + dataGen = null; + + @Before + public void init() throws IOException { + // Initialize a local spark env + SparkConf sparkConf = new SparkConf().setAppName("TestHoodieClient").setMaster("local[4]"); + jsc = new JavaSparkContext(HoodieReadClient.addHoodieSupport(sparkConf)); + + //SQLContext stuff + sqlContext = new SQLContext(jsc); + + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.initializeHoodieDirectory(basePath); + + dataGen = new HoodieTestDataGenerator(); + } + + + private HoodieWriteConfig getConfig() { + return getConfigBuilder().build(); + } + + private HoodieWriteConfig.Builder getConfigBuilder() { + return HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build()) + .forTable("test-trip-table").withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()); + } + + private void 
assertNoWriteErrors(List statuses) { + // Verify there are no errors + for (WriteStatus status : statuses) { + assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors()); + } + } + + private void checkTaggedRecords(List taggedRecords, String commitTime) { + for (HoodieRecord rec : taggedRecords) { + assertTrue("Record " + rec + " found with no location.", rec.isCurrentLocationKnown()); + assertEquals("All records should have commit time "+ commitTime+", since updates were made", + rec.getCurrentLocation().getCommitTime(), commitTime); + } + } + + @Test + public void testFilterExist() throws Exception { + HoodieWriteConfig config = getConfig(); + HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); + String newCommitTime = writeClient.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 100); + JavaRDD recordsRDD = jsc.parallelize(records, 1); + + HoodieReadClient readClient = new HoodieReadClient(jsc, config.getBasePath()); + JavaRDD filteredRDD = readClient.filterExists(recordsRDD); + + // Should not find any files + assertTrue(filteredRDD.collect().size() == 100); + + JavaRDD smallRecordsRDD = jsc.parallelize(records.subList(0, 75), 1); + // We create three parquet file, each having one record. 
(two different partitions) + List statuses = writeClient.insert(smallRecordsRDD, newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + readClient = new HoodieReadClient(jsc, config.getBasePath()); + filteredRDD = readClient.filterExists(recordsRDD); + List result = filteredRDD.collect(); + // Check results + assertTrue(result.size() == 25); + } + + @Test + public void testUpserts() throws Exception { + HoodieWriteConfig cfg = getConfig(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); + FileSystem fs = FSUtils.getFs(); + + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + List records = dataGen.generateInserts(newCommitTime, 200); + JavaRDD writeRecords = jsc.parallelize(records, 1); + + List statuses = client.upsert(writeRecords, newCommitTime).collect(); + assertNoWriteErrors(statuses); + + // verify that there is a commit + HoodieReadClient readClient = new HoodieReadClient(jsc, basePath, sqlContext); + assertEquals("Expecting a single commit.", readClient.listCommitsSince("000").size(), 1); + assertEquals("Latest commit should be 001",readClient.latestCommit(), newCommitTime); + assertEquals("Must contain 200 records", readClient.readCommit(newCommitTime).count(), records.size()); + // Should have 100 records in table (check using Index), all in locations marked at commit + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetadata(fs, basePath)).collect(); + checkTaggedRecords(taggedRecords, "001"); + + /** + * Write 2 (updates) + */ + newCommitTime = "004"; + records = dataGen.generateUpdates(newCommitTime, 100); + LinkedHashMap recordsMap = new LinkedHashMap<>(); + for (HoodieRecord rec : records) { + if (!recordsMap.containsKey(rec.getKey())) { + recordsMap.put(rec.getKey(), rec); + } + } + List dedupedRecords = new ArrayList<>(recordsMap.values()); + + statuses = 
client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // verify there are now 2 commits + readClient = new HoodieReadClient(jsc, basePath, sqlContext); + assertEquals("Expecting two commits.", readClient.listCommitsSince("000").size(), 2); + assertEquals("Latest commit should be 004",readClient.latestCommit(), newCommitTime); + + // Index should be able to locate all updates in correct locations. + taggedRecords = index.tagLocation(jsc.parallelize(dedupedRecords, 1), new HoodieTableMetadata(fs, basePath)).collect(); + checkTaggedRecords(taggedRecords, "004"); + + // Check the entire dataset has 100 records still + String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length]; + for (int i=0; i < fullPartitionPaths.length; i++) { + fullPartitionPaths[i] = String.format("%s/%s/*", basePath, dataGen.getPartitionPaths()[i]); + } + assertEquals("Must contain 200 records", readClient.read(fullPartitionPaths).count(), 200); + + + // Check that the incremental consumption from time 000 + assertEquals("Incremental consumption from time 002, should give all records in commit 004", + readClient.readCommit(newCommitTime).count(), + readClient.readSince("002").count()); + assertEquals("Incremental consumption from time 001, should give all records in commit 004", + readClient.readCommit(newCommitTime).count(), + readClient.readSince("001").count()); + } + + @Test + public void testInsertAndCleanByVersions() throws Exception { + int maxVersions = 2; // keep upto 2 versions for each file + HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaner.CleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + .retainFileVersions(maxVersions).build()).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); + FileSystem fs = FSUtils.getFs(); + + /** + * 
do a big insert + * (this is basically same as insert part of upsert, just adding it here so we can + * catch breakages in insert(), if the implementation diverges.) + */ + String newCommitTime = client.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 500); + JavaRDD writeRecords = jsc.parallelize(records, 5); + + List statuses = client.insert(writeRecords, newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // verify that there is a commit + + assertEquals("Expecting a single commit.", new HoodieReadClient(jsc, basePath).listCommitsSince("000").size(), 1); + // Should have 100 records in table (check using Index), all in locations marked at commit + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetadata(fs, basePath)).collect(); + checkTaggedRecords(taggedRecords, newCommitTime); + + // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. + for (int writeCnt = 2; writeCnt < 10; writeCnt++) { + + Thread.sleep(1100); // make sure commits are unique + newCommitTime = client.startCommit(); + records = dataGen.generateUpdates(newCommitTime, 100); + + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); + SortedMap commitMetadata = metadata.getAllCommitMetadata(); + + // Need to ensure the following + for (String partitionPath : dataGen.getPartitionPaths()) { + // compute all the versions of all files, from time 0 + HashMap> fileIdToVersions = new HashMap<>(); + for (Map.Entry entry : commitMetadata.entrySet()) { + for (HoodieWriteStat wstat : entry.getValue().getWriteStats(partitionPath)) { + if (!fileIdToVersions.containsKey(wstat.getFileId())) { + fileIdToVersions.put(wstat.getFileId(), new TreeSet()); + } + 
fileIdToVersions.get(wstat.getFileId()).add(entry.getKey()); + } + } + + Map> fileVersions = metadata.getAllVersionsInPartition(fs, partitionPath); + for (Map.Entry> entry : fileVersions.entrySet()) { + List versions = entry.getValue(); + // No file has no more than max versions + assertTrue("fileId " + entry.getKey() + " has more than " + maxVersions + " versions", + versions.size() <= maxVersions); + + // Each file, has the latest N versions (i.e cleaning gets rid of older versions) + List commitedVersions = new ArrayList<>(fileIdToVersions.get(entry.getKey())); + for (int i = 0; i < versions.size(); i++) { + assertEquals("File " + entry.getKey() + " does not have latest versions" + versions + " on commits" + commitedVersions, + FSUtils.getCommitTime(Iterables.get(versions, i).getPath().getName()), + commitedVersions.get(commitedVersions.size() - 1 - i)); + } + } + } + } + } + + @Test + public void testInsertAndCleanByCommits() throws Exception { + int maxCommits = 3; // keep upto 3 commits from the past + HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig( + HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaner.CleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + .retainCommits(maxCommits).build()).build(); + HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); + HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); + FileSystem fs = FSUtils.getFs(); + + /** + * do a big insert + * (this is basically same as insert part of upsert, just adding it here so we can + * catch breakages in insert(), if the implementation diverges.) 
+ */ + String newCommitTime = client.startCommit(); + List records = dataGen.generateInserts(newCommitTime, 500); + JavaRDD writeRecords = jsc.parallelize(records, 5); + + List statuses = client.insert(writeRecords, newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + // verify that there is a commit + assertEquals("Expecting a single commit.", new HoodieReadClient(jsc, basePath).listCommitsSince("000").size(), 1); + // Should have 100 records in table (check using Index), all in locations marked at commit + List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), new HoodieTableMetadata(fs, basePath)).collect(); + checkTaggedRecords(taggedRecords, newCommitTime); + + // Keep doing some writes and clean inline. Make sure we have expected number of files remaining. + for (int writeCnt = 2; writeCnt < 10; writeCnt++) { + Thread.sleep(1100); // make sure commits are unique + newCommitTime = client.startCommit(); + records = dataGen.generateUpdates(newCommitTime, 100); + + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); + // Verify there are no errors + assertNoWriteErrors(statuses); + + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); + HoodieCommits commits = metadata.getAllCommits(); + String earliestRetainedCommit = commits.lastCommit(maxCommits - 1); + Set acceptableCommits = new HashSet<>(commits.getCommitList()); + if (earliestRetainedCommit != null) { + acceptableCommits.removeAll(commits.findCommitsInRange("000", earliestRetainedCommit)); + acceptableCommits.add(earliestRetainedCommit); + } + + // Need to ensure the following + for (String partitionPath : dataGen.getPartitionPaths()) { + Map> fileVersions = metadata.getAllVersionsInPartition(fs, partitionPath); + for (Map.Entry> entry : fileVersions.entrySet()) { + Set commitTimes = new HashSet<>(entry.getValue().size()); + for(FileStatus value:entry.getValue()) { + 
commitTimes.add(FSUtils.getCommitTime(value.getPath().getName())); + } + assertEquals("Only contain acceptable versions of file should be present", + acceptableCommits, commitTimes); + } + } + } + } + + @Test + public void testRollbackCommit() throws Exception { + // Let's create some commit files and parquet files + String commitTime1 = "20160501010101"; + String commitTime2 = "20160502020601"; + String commitTime3 = "20160506030611"; + new File(basePath + "/.hoodie").mkdirs(); + + // Only first two have commit files + HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2); + // Third one has a .inflight intermediate commit file + HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3); + + // Make commit1 + String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11"); + String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12"); + String file13 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13"); + + // Make commit2 + String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); + String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); + String file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); + + // Make commit3 + String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31"); + String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32"); + String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); + + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY) + .build()).build(); + + HoodieWriteClient client = new HoodieWriteClient(jsc, config, false); + + // Rollback commit 1 (this should fail, since commit2 is still around) + try { + 
client.rollback(commitTime1); + assertTrue("Should have thrown an exception ", false); + } catch (HoodieRollbackException hrbe) { + // should get here + } + + // Rollback commit3 + client.rollback(commitTime3); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); + + // simulate partial failure, where .inflight was not deleted, but data files were. + HoodieTestUtils.createInflightCommitFiles(basePath, commitTime3); + client.rollback(commitTime3); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + + + // Rollback commit2 + client.rollback(commitTime2); + assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + + // simulate partial failure, where only .commit => .inflight renaming succeeded, leaving a + // .inflight commit and a bunch of data files around. 
+ HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2); + file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); + file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); + file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); + + client.rollback(commitTime2); + assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + + + // Let's rollback commit1, Check results + client.rollback(commitTime1); + assertFalse(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime1)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); + } + + + @Test + public void testAutoRollbackCommit() throws Exception { + // Let's create some commit files and parquet files + String commitTime1 = "20160501010101"; + String commitTime2 = "20160502020601"; + String commitTime3 = "20160506030611"; + new File(basePath + "/.hoodie").mkdirs(); + + // One good commit + HoodieTestUtils.createCommitFiles(basePath, commitTime1); + // Two inflight commits + HoodieTestUtils.createInflightCommitFiles(basePath, commitTime2, commitTime3); + + // Make commit1 + String file11 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime1, "id11"); + String file12 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime1, "id12"); + String file13 = 
HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime1, "id13"); + + // Make commit2 + String file21 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime2, "id21"); + String file22 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime2, "id22"); + String file23 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime2, "id23"); + + // Make commit3 + String file31 = HoodieTestUtils.createDataFile(basePath, "2016/05/01", commitTime3, "id31"); + String file32 = HoodieTestUtils.createDataFile(basePath, "2016/05/02", commitTime3, "id32"); + String file33 = HoodieTestUtils.createDataFile(basePath, "2016/05/06", commitTime3, "id33"); + + // Turn auto rollback off + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY) + .build()).build(); + + new HoodieWriteClient(jsc, config, false); + + // Check results, nothing changed + assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); + assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertTrue(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, 
file13)); + + // Turn auto rollback on + new HoodieWriteClient(jsc, config, true); + assertTrue(HoodieTestUtils.doesCommitExist(basePath, commitTime1)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime2)); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, commitTime3)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime3, file31) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime3, file32) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime3, file33)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime2, file21) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime2, file22) || + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime2, file23)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2016/05/01", commitTime1, file11) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/02", commitTime1, file12) && + HoodieTestUtils.doesDataFileExist(basePath, "2016/05/06", commitTime1, file13)); + } + + @Test + public void testSmallInsertHandling() throws Exception { + + HoodieWriteConfig.Builder builder = getConfigBuilder(); + FileSystem fs = FSUtils.getFs(); + + + final String TEST_PARTITION_PATH = "2016/09/26"; + final int INSERT_SPLIT_LIMIT = 10; + // based on examination of sample file, the schema produces the following per record size + final int SIZE_PER_RECORD = 50 * 1024; + // setup the small file handling params + HoodieWriteConfig config = builder.withCompactionConfig( + HoodieCompactionConfig.newBuilder().compactionSmallFileSize(SIZE_PER_RECORD * 15) + .insertSplitSize(INSERT_SPLIT_LIMIT).build()) // tolerate upto 15 records + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(SIZE_PER_RECORD * 20) + .build()).build(); // hold upto 20 records max + dataGen = new HoodieTestDataGenerator(new String[] {TEST_PARTITION_PATH}); + + HoodieWriteClient client = new 
HoodieWriteClient(jsc, config); + + // Inserts => will write file1 + String commitTime1 = "001"; + List inserts1 = dataGen.generateInserts(commitTime1, INSERT_SPLIT_LIMIT); // this writes ~500kb + Set keys1 = HoodieClientTestUtils.getRecordKeys(inserts1); + + JavaRDD insertRecordsRDD1 = jsc.parallelize(inserts1, 1); + List statuses= client.upsert(insertRecordsRDD1, commitTime1).collect(); + + assertNoWriteErrors(statuses); + + assertEquals("Just 1 file needs to be added.", 1, statuses.size()); + String file1 = statuses.get(0).getFileId(); + assertEquals("file should contain 10 records", + ParquetUtils.readRowKeysFromParquet(new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), + 10); + + // Update + Inserts such that they just expand file1 + String commitTime2 = "002"; + List inserts2 = dataGen.generateInserts(commitTime2, 4); + Set keys2 = HoodieClientTestUtils.getRecordKeys(inserts2); + List insertsAndUpdates2 = new ArrayList<>(); + insertsAndUpdates2.addAll(inserts2); + insertsAndUpdates2.addAll(dataGen.generateUpdates(commitTime2, inserts1)); + + JavaRDD insertAndUpdatesRDD2 = jsc.parallelize(insertsAndUpdates2, 1); + statuses = client.upsert(insertAndUpdatesRDD2, commitTime2).collect(); + assertNoWriteErrors(statuses); + + assertEquals("Just 1 file needs to be updated.", 1, statuses.size()); + assertEquals("Existing file should be expanded", file1, statuses.get(0).getFileId()); + assertEquals("Existing file should be expanded", commitTime1, statuses.get(0).getStat().getPrevCommit()); + Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); + assertEquals("file should contain 14 records", ParquetUtils.readRowKeysFromParquet(newFile).size(), 14); + + List records = ParquetUtils.readAvroRecords(newFile); + for (GenericRecord record: records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertEquals("only expect 
commit2", commitTime2, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); + assertTrue("key expected to be part of commit2", keys2.contains(recordKey) || keys1.contains(recordKey)); + } + + // update + inserts such that file1 is updated and expanded, a new file2 is created. + String commitTime3 = "003"; + List insertsAndUpdates3 = dataGen.generateInserts(commitTime3, 20); + Set keys3 = HoodieClientTestUtils.getRecordKeys(insertsAndUpdates3); + List updates3 = dataGen.generateUpdates(commitTime3, inserts2); + insertsAndUpdates3.addAll(updates3); + + JavaRDD insertAndUpdatesRDD3 = jsc.parallelize(insertsAndUpdates3, 1); + statuses = client.upsert(insertAndUpdatesRDD3, commitTime3).collect(); + assertNoWriteErrors(statuses); + + assertEquals("2 files needs to be committed.", 2, statuses.size()); + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); + FileStatus[] files = metadata.getLatestVersionInPartition(fs, TEST_PARTITION_PATH, commitTime3); + int numTotalInsertsInCommit3 = 0; + for (FileStatus file: files) { + if (file.getPath().getName().contains(file1)) { + assertEquals("Existing file should be expanded", commitTime3, FSUtils.getCommitTime(file.getPath().getName())); + records = ParquetUtils.readAvroRecords(file.getPath()); + for (GenericRecord record: records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); + if (recordCommitTime.equals(commitTime3)) { + if (keys2.contains(recordKey)) { + assertEquals("only expect commit3", commitTime3, recordCommitTime); + keys2.remove(recordKey); + } else { + numTotalInsertsInCommit3++; + } + } + } + assertEquals("All keys added in commit 2 must be updated in commit3 correctly", 0, keys2.size()); + } else { + assertEquals("New file must be written for commit 3", commitTime3, FSUtils.getCommitTime(file.getPath().getName())); + records = 
ParquetUtils.readAvroRecords(file.getPath()); + for (GenericRecord record: records) { + String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); + assertEquals("only expect commit3", commitTime3, record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()); + assertTrue("key expected to be part of commit3", keys3.contains(recordKey)); + } + numTotalInsertsInCommit3 += records.size(); + } + } + assertEquals("Total inserts in commit3 must add up", keys3.size(), numTotalInsertsInCommit3); + } + + + @After + public void clean() { + if (basePath != null) { + new File(basePath).delete(); + } + if (jsc != null) { + jsc.stop(); + } + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java new file mode 100644 index 000000000..63095f592 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common; + +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +/** + * Utility methods to aid testing inside the HoodieClient module. + */ +public class HoodieClientTestUtils { + + + public static List collectStatuses(Iterator> statusListItr) { + List statuses = new ArrayList<>(); + while (statusListItr.hasNext()) { + statuses.addAll(statusListItr.next()); + } + return statuses; + } + + public static Set getRecordKeys(List hoodieRecords) { + Set keys = new HashSet<>(); + for (HoodieRecord rec: hoodieRecords) { + keys.add(rec.getRecordKey()); + } + return keys; + } + + private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException { + String parentPath = basePath + "/"+ HoodieTableMetadata.METAFOLDER_NAME; + new File(parentPath).mkdirs(); + new File(parentPath + "/" + commitTime + suffix).createNewFile(); + } + + + public static void fakeCommitFile(String basePath, String commitTime) throws IOException { + fakeMetaFile(basePath, commitTime, HoodieTableMetadata.COMMIT_FILE_SUFFIX); + } + + public static void fakeInFlightFile(String basePath, String commitTime) throws IOException { + fakeMetaFile(basePath, commitTime, HoodieTableMetadata.INFLIGHT_FILE_SUFFIX); + } + + public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception { + fakeDataFile(basePath, partitionPath, commitTime, fileId, 0); + } + + public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length) throws Exception { + String parentPath = String.format("%s/%s", 
basePath, partitionPath); + new File(parentPath).mkdirs(); + String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId)); + new File(path).createNewFile(); + new RandomAccessFile(path, "rw").setLength(length); + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java new file mode 100644 index 000000000..301e10e99 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.common; + +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.HoodieAvroUtils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.UUID; + +/** + * Class to be used in tests to keep generating test inserts & updates against a corpus. + * + * Test data uses a toy Uber trips, data model. 
+ */ +public class HoodieTestDataGenerator { + + + private static Logger logger = LogManager.getLogger(HoodieTestDataGenerator.class); + + static class KeyPartition { + HoodieKey key; + String partitionPath; + } + + public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," + + "\"name\": \"triprec\"," + + "\"fields\": [ " + + "{\"name\": \"_row_key\", \"type\": \"string\"}," + + "{\"name\": \"rider\", \"type\": \"string\"}," + + "{\"name\": \"driver\", \"type\": \"string\"}," + + "{\"name\": \"begin_lat\", \"type\": \"double\"}," + + "{\"name\": \"begin_lon\", \"type\": \"double\"}," + + "{\"name\": \"end_lat\", \"type\": \"double\"}," + + "{\"name\": \"end_lon\", \"type\": \"double\"}," + + "{\"name\":\"fare\",\"type\": \"double\"}]}"; + + + private List existingKeysList = new ArrayList<>(); + private static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA)); + private static Random rand = new Random(46474747); + private String[] partitionPaths = {"2016/03/15", "2015/03/16", "2015/03/17"}; + + public HoodieTestDataGenerator(String[] partitionPaths) { + this.partitionPaths = partitionPaths; + } + + public HoodieTestDataGenerator() { + this(new String[]{"2016/03/15", "2015/03/16", "2015/03/17"}); + } + + + /** + * Generates new inserts, uniformly across the partition paths above. It also updates the list + * of existing keys. 
+ */ + public List generateInserts(String commitTime, int n) throws IOException { + List inserts = new ArrayList<>(); + for (int i = 0; i < n; i++) { + String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)]; + HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); + HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime)); + inserts.add(record); + + KeyPartition kp = new KeyPartition(); + kp.key = key; + kp.partitionPath = partitionPath; + existingKeysList.add(kp); + + logger.info(" GENERATING INSERT FOR :" + key + "," + record.getPartitionPath()); + } + return inserts; + } + + + public List generateUpdates(String commitTime, List baseRecords) throws IOException { + List updates = new ArrayList<>(); + for (HoodieRecord baseRecord: baseRecords) { + HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime)); + updates.add(record); + logger.info(" GENERATING UPDATE FOR :" + baseRecord.getKey()); + } + return updates; + } + + /** + * Generates new updates, randomly distributed across the keys above. + */ + public List generateUpdates(String commitTime, int n) throws IOException { + List updates = new ArrayList<>(); + for (int i = 0; i < n; i++) { + KeyPartition kp = existingKeysList.get(rand.nextInt(existingKeysList.size() - 1)); + HoodieRecord record = new HoodieRecord(kp.key, generateRandomValue(kp.key, commitTime)); + updates.add(record); + logger.info(" GENERATING UPDATE FOR :" + kp.key); + } + return updates; + } + + + /** + * Generates a new avro record of the above schema format, retaining the key if optionally + * provided. 
+ */ + public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException { + GenericRecord rec = new GenericData.Record(avroSchema); + rec.put("_row_key", key.getRecordKey()); + rec.put("rider", "rider-" + commitTime); + rec.put("driver", "driver-" + commitTime); + rec.put("begin_lat", rand.nextDouble()); + rec.put("begin_lon", rand.nextDouble()); + rec.put("end_lat", rand.nextDouble()); + rec.put("end_lon", rand.nextDouble()); + rec.put("fare", rand.nextDouble() * 100); + HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1"); + return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA); + } + + public static void createCommitFile(String basePath, String commitTime) throws IOException { + Path commitFile = + new Path(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME + "/" + FSUtils.makeCommitFileName(commitTime)); + FileSystem fs = FSUtils.getFs(); + FSDataOutputStream os = fs.create(commitFile, true); + HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); + try { + // Write empty commit metadata + os.writeBytes(new String(commitMetadata.toJsonString().getBytes( + StandardCharsets.UTF_8))); + } finally { + os.close(); + } + + } + + public String[] getPartitionPaths() { + return partitionPaths; + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/TestRawTripPayload.java b/hoodie-client/src/test/java/com/uber/hoodie/common/TestRawTripPayload.java new file mode 100644 index 000000000..6d8182275 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/TestRawTripPayload.java @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import com.uber.hoodie.avro.MercifulJsonConverter; +import com.uber.hoodie.common.model.HoodieRecordPayload; + +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.commons.io.IOUtils; + +import java.io.*; +import java.util.Map; +import java.util.zip.Deflater; +import java.util.zip.DeflaterOutputStream; +import java.util.zip.InflaterInputStream; + +/** + * Example row change event based on some example data used by testcases. The data avro schema is + * src/test/resources/schema1. 
+ */ +public class TestRawTripPayload implements HoodieRecordPayload { + private transient static final ObjectMapper mapper = new ObjectMapper(); + private String partitionPath; + private String rowKey; + private byte[] jsonDataCompressed; + private int dataSize; + + public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, String schemaStr) throws IOException { + this.jsonDataCompressed = compressData(jsonData); + this.dataSize = jsonData.length(); + this.rowKey = rowKey; + this.partitionPath = partitionPath; + } + + public TestRawTripPayload(String jsonData) throws IOException { + this.jsonDataCompressed = compressData(jsonData); + this.dataSize = jsonData.length(); + Map jsonRecordMap = mapper.readValue(jsonData, Map.class); + this.rowKey = jsonRecordMap.get("_row_key").toString(); + this.partitionPath = jsonRecordMap.get("time").toString().split("T")[0].replace("-", "/"); + } + + public String getPartitionPath() { + return partitionPath; + } + + + @Override public TestRawTripPayload preCombine(TestRawTripPayload another) { + return another; + } + + @Override public IndexedRecord combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { + return this.getInsertValue(schema); + } + + @Override public IndexedRecord getInsertValue(Schema schema) throws IOException { + MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema); + return jsonConverter.convert(getJsonData()); + } + + public String getRowKey() { + return rowKey; + } + + public String getJsonData() throws IOException { + return unCompressData(jsonDataCompressed); + } + + + private byte[] compressData(String jsonData) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + DeflaterOutputStream dos = + new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true); + try { + dos.write(jsonData.getBytes()); + } finally { + dos.flush(); + dos.close(); + } + return baos.toByteArray(); + } + + + private String 
unCompressData(byte[] data) throws IOException { + InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data)); + StringWriter sw = new StringWriter(dataSize); + IOUtils.copy(iis, sw); + return sw.toString(); + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java new file mode 100644 index 000000000..a954759c5 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.func; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.TestRawTripPayload; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.table.HoodieCopyOnWriteTable; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.Path; +import org.junit.Before; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import static org.junit.Assert.fail; + +public class TestUpdateMapFunction { + private String basePath = null; + + @Before + public void init() throws Exception { + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + this.basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.initializeHoodieDirectory(basePath); + } + + @Test + public void testSchemaEvolutionOnUpdate() throws Exception { + // Create a bunch of records with a old version of schema + HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt"); + HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable("100", config, metadata); + + String recordStr1 = + "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = + "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = + "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + List records = new 
ArrayList<>(); + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + records.add( + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), + rowChange1)); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + records.add( + new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), + rowChange2)); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + records.add( + new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), + rowChange3)); + Iterator> insertResult = table.handleInsert(records.iterator()); + Path commitFile = + new Path(config.getBasePath() + "/.hoodie/" + FSUtils.makeCommitFileName("100")); + FSUtils.getFs().create(commitFile); + + // Now try an update with an evolved schema + // Evolved schema does not have guarantee on preserving the original field ordering + config = makeHoodieClientConfig("/exampleEvolvedSchema.txt"); + metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath); + String fileId = insertResult.next().get(0).getFileId(); + System.out.println(fileId); + + + table = new HoodieCopyOnWriteTable("101", config, metadata); + // New content with values for the newly added field + recordStr1 = + "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}"; + records = new ArrayList<>(); + rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = + new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), + rowChange1); + record1.setCurrentLocation(new HoodieRecordLocation("100", fileId)); + records.add(record1); + + try { + table.handleUpdate(fileId, records.iterator()); + } catch (ClassCastException e) { + fail( + "UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt"); + } + } + + private HoodieWriteConfig makeHoodieClientConfig(String 
schema) throws Exception { + // Prepare the AvroParquetIO + String schemaStr = IOUtils.toString(getClass().getResourceAsStream(schema), "UTF-8"); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr).build(); + } + +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieBloomIndex.java b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieBloomIndex.java new file mode 100644 index 000000000..24091fa71 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieBloomIndex.java @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.index; + +import com.google.common.base.Optional; +import com.google.common.collect.Lists; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.avro.HoodieAvroWriteSupport; +import com.uber.hoodie.common.BloomFilter; +import com.uber.hoodie.common.TestRawTripPayload; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.HoodieAvroUtils; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import scala.Tuple2; + +import java.io.File; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.*; + +import static org.junit.Assert.*; + +public class TestHoodieBloomIndex { + private JavaSparkContext jsc = null; + private String basePath = null; + private transient final FileSystem fs; + + public TestHoodieBloomIndex() throws Exception { + fs = FSUtils.getFs(); + } + + @Before + public void init() throws IOException { + // Initialize a local spark env + SparkConf sparkConf = new SparkConf().setAppName("TestHoodieBloomIndex").setMaster("local[4]"); + jsc = new JavaSparkContext(sparkConf); + // Create a temp folder as the base path + TemporaryFolder folder = new 
TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.initializeHoodieDirectory(basePath); + } + + @Test + public void testLoadUUIDsInMemory() throws IOException { + // Create one RDD of hoodie record + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + + String schemaStr = + IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + + JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); + + // Load to memory + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); + Map> map = index.getPartitionToRowKeys(recordRDD); + 
assertEquals(map.size(), 2); + List list1 = Lists.newArrayList(map.get("2016/01/31")); + List list2 = Lists.newArrayList(map.get("2015/01/31")); + assertEquals(list1.size(), 3); + assertEquals(list2.size(), 1); + } + + @Test + public void testLoadInvolvedFiles() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); + + // Create some partitions, and put some files + // "2016/01/21": 0 file + // "2016/04/01": 1 file (2_0_20160401010101.parquet) + // "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet) + new File(basePath + "/2016/01/21").mkdirs(); + new File(basePath + "/2016/04/01").mkdirs(); + new File(basePath + "/2015/03/12").mkdirs(); + new File(basePath + "/2016/04/01/2_0_20160401010101.parquet").createNewFile(); + new File(basePath + "/2015/03/12/1_0_20150312101010.parquet").createNewFile(); + new File(basePath + "/2015/03/12/3_0_20150312101010.parquet").createNewFile(); + new File(basePath + "/2015/03/12/4_0_20150312101010.parquet").createNewFile(); + List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + JavaPairRDD rdd = index.loadInvolvedFiles(partitions, metadata); + // Still 0, as no valid commit + assertEquals(rdd.count(), 0); + + // Add some commits + new File(basePath + "/.hoodie").mkdirs(); + new File(basePath + "/.hoodie/20160401010101.commit").createNewFile(); + new File(basePath + "/.hoodie/20150312101010.commit").createNewFile(); + metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + rdd = index.loadInvolvedFiles(partitions, metadata); + final List> filesList = rdd.collect(); + assertEquals(filesList.size(), 4); + + // no longer sorted, but should have same files. 
+ Set actualFiles = new HashSet(){{ + add(filesList.get(0)._1 + "/" + filesList.get(0)._2); + add(filesList.get(1)._1 + "/" + filesList.get(1)._2); + add(filesList.get(2)._1 + "/" + filesList.get(2)._2); + add(filesList.get(3)._1 + "/" + filesList.get(3)._2); + }}; + + Set expected = new HashSet() {{ + add("2016/04/01/2_0_20160401010101.parquet"); + add("2015/03/12/1_0_20150312101010.parquet"); + add("2015/03/12/3_0_20150312101010.parquet"); + add("2015/03/12/4_0_20150312101010.parquet"); + }}; + assertEquals(expected, actualFiles); + } + + @Test + public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieBloomIndex index = new HoodieBloomIndex(config, jsc); + + String schemaStr = + IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); + // Create some records to use + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}"; + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 
= new TestRawTripPayload(recordStr3); + HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + + + // We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3). + BloomFilter filter = new BloomFilter(10000, 0.0000001); + filter.add(record3.getRecordKey()); + String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true); + + // The bloom filter contains 3 records + assertTrue(filter.mightContain(record1.getRecordKey())); + assertTrue(filter.mightContain(record2.getRecordKey())); + assertTrue(filter.mightContain(record3.getRecordKey())); + assertFalse(filter.mightContain(record4.getRecordKey())); + + // Compare with file + List uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), + record3.getRecordKey(), record4.getRecordKey()); + + List results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids, + new Path(basePath + "/2016/01/31/" + filename)); + assertEquals(results.size(), 2); + assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")); + assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") + || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")); + // TODO(vc): Need more coverage on actual filenames + //assertTrue(results.get(0)._2().equals(filename)); + //assertTrue(results.get(1)._2().equals(filename)); + } + + @Test + public void testTagLocationWithEmptyRDD() throws Exception { + // We have some records to be tagged (two different partitions) + JavaRDD recordRDD = jsc.emptyRDD(); + // Also create the metadata and config + HoodieTableMetadata metadata = new 
HoodieTableMetadata(fs, basePath, "testTable"); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + + try { + bloomIndex.tagLocation(recordRDD, metadata); + } catch (IllegalArgumentException e) { + fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required"); + } + } + + + @Test + public void testTagLocation() throws Exception { + // We have some records to be tagged (two different partitions) + String schemaStr = + IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); + + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord record4 = new HoodieRecord(new 
HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4)); + + // Also create the metadata and config + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, metadata); + + // Should not find any files + for (HoodieRecord record : taggedRecordRDD.collect()) { + assertTrue(!record.isCurrentLocationKnown()); + } + + // We create three parquet file, each having one record. (two different partitions) + String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true); + String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true); + String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); + + // We do the tag again + metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + taggedRecordRDD = bloomIndex.tagLocation(recordRDD, metadata); + + // Check results + for (HoodieRecord record : taggedRecordRDD.collect()) { + if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1))); + } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2))); + } else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(!record.isCurrentLocationKnown()); + } else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3))); + } + } + } + + @Test + public void 
testCheckExists() throws Exception { + // We have some records to be tagged (two different partitions) + String schemaStr = + IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); + + String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}"; + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()); + HoodieRecord record1 = new HoodieRecord(key1, rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()); + HoodieRecord record2 = new HoodieRecord(key2, rowChange2); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()); + HoodieRecord record3 = new HoodieRecord(key3, rowChange3); + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()); + HoodieRecord record4 = new HoodieRecord(key4, rowChange4); + JavaRDD keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4)); + + // Also create the metadata and config + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieWriteConfig config = 
HoodieWriteConfig.newBuilder().withPath(basePath).build(); + + // Let's tag + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + JavaPairRDD> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, metadata); + + // Should not find any files + for (Tuple2> record : taggedRecordRDD.collect()) { + assertTrue(!record._2.isPresent()); + } + + // We create three parquet file, each having one record. (two different partitions) + String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true); + String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true); + String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); + + // We do the tag again + metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, metadata); + + // Check results + for (Tuple2> record : taggedRecordRDD.collect()) { + if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertTrue(record._2.isPresent()); + Path path1 = new Path(record._2.get()); + assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName())); + } else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + assertTrue(record._2.isPresent()); + Path path2 = new Path(record._2.get()); + assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName())); + } else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(!record._2.isPresent()); + } else if (record._1.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) { + assertTrue(record._2.isPresent()); + Path path3 = new Path(record._2.get()); + assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path3.getName())); + } + } + } + + + @Test + public void testBloomFilterFalseError() throws IOException, InterruptedException { + // We have two hoodie records + String recordStr1 = 
"{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + + // We write record1 to a parquet file, using a bloom filter having both records + String schemaStr = + IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr)); + + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2); + + BloomFilter filter = new BloomFilter(10000, 0.0000001); + filter.add(record2.getRecordKey()); + String filename = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, filter, true); + assertTrue(filter.mightContain(record1.getRecordKey())); + assertTrue(filter.mightContain(record2.getRecordKey())); + + // We do the tag + JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2)); + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); + HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc); + JavaRDD taggedRecordRDD = bloomIndex.tagLocation(recordRDD, metadata); + + // Check results + for (HoodieRecord record : taggedRecordRDD.collect()) { + if (record.getKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) { + assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename))); + } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) { + 
assertFalse(record.isCurrentLocationKnown()); + } + } + } + + private String writeParquetFile(String partitionPath, List records, Schema schema, + BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException { + Thread.sleep(1000); + String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); + String fileId = UUID.randomUUID().toString(); + String filename = FSUtils.makeDataFileName(commitTime, 1, fileId); + + return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime); + } + + private String writeParquetFile(String partitionPath, String filename, List records, Schema schema, + BloomFilter filter, boolean createCommitTime) throws IOException { + if (filter == null) { + filter = new BloomFilter(10000, 0.0000001); + } + HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter); + ParquetWriter writer = new ParquetWriter(new Path(basePath + "/" + partitionPath + "/" + filename), + writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE); + int seqId = 1; + String commitTime = FSUtils.getCommitTime(filename); + for (HoodieRecord record : records) { + GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema); + HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++); + HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename); + writer.write(avroRecord); + filter.add(record.getRecordKey()); + } + writer.close(); + + if (createCommitTime) { + // Also make sure the commit is valid + new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME).mkdirs(); + new File(basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile(); + } + return filename; + } + + @After + public void clean() { + if (jsc != null) { + jsc.stop(); + } + if (basePath != null) { + new 
File(basePath).delete(); + } + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieIndex.java b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieIndex.java new file mode 100644 index 000000000..07ebff5ad --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/index/TestHoodieIndex.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.index; + +import com.uber.hoodie.config.HoodieWriteConfig; + +import com.uber.hoodie.config.HoodieIndexConfig; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class TestHoodieIndex { + @Test + public void testCreateIndex() throws Exception { + HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder(); + HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder(); + // Different types + HoodieWriteConfig config = clientConfigBuilder.withPath("") + .withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build()) + .build(); + assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex); + config = clientConfigBuilder.withPath("").withIndexConfig( + indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build(); + assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex); + config = clientConfigBuilder.withPath("") + 
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex); + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCleaner.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCleaner.java new file mode 100644 index 000000000..4b5e6b629 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCleaner.java @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.io; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.util.FSUtils; + +import com.uber.hoodie.config.HoodieCompactionConfig; +import org.junit.Before; +import org.junit.Test; +import java.io.IOException; +import static org.junit.Assert.*; + +/** + * Tests around Cleaning logic in Hoodie + */ +public class TestHoodieCleaner { + + private String basePath = null; + private String[] partitionPaths = {"2016/01/01", "2016/02/02"}; + + @Before + public void init() throws Exception { + this.basePath = HoodieTestUtils.initializeTempHoodieBasePath(); + } + + @Test + public void testKeepLatestFileVersions() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaner.CleaningPolicy.KEEP_LATEST_FILE_VERSIONS) + .retainFileVersions(1).build()).build(); + + // make 1 commit, with 1 file per partition + HoodieTestUtils.createCommitFiles(basePath, "000"); + + String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); + String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); + + HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieCleaner cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[1])); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + HoodieTestUtils.createCommitFiles(basePath, 
"001"); + + String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert + String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update + metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); + assertEquals("Must clean 1 file" , 1, cleaner.clean(partitionPaths[0])); + assertEquals("Must clean 1 file" , 1, cleaner.clean(partitionPaths[1])); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + HoodieTestUtils.createCommitFiles(basePath, "002"); + + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update + String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); + metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); + assertEquals("Must clean two files" , 2, cleaner.clean(partitionPaths[0])); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); + + // No 
cleaning on partially written file, with no commit. + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file3P0C2); // update + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); + } + + + @Test + public void testKeepLatestCommits() throws IOException { + HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath) + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .withCleanerPolicy(HoodieCleaner.CleaningPolicy.KEEP_LATEST_COMMITS) + .retainCommits(2).build()).build(); + + + // make 1 commit, with 1 file per partition + HoodieTestUtils.createCommitFiles(basePath, "000"); + + String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); + String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); + + HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieCleaner cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[1])); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 1 insert & 1 update per partition + HoodieTestUtils.createCommitFiles(basePath, "001"); + + String file2P0C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "001"); // insert + String file2P1C1 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "001"); // insert + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "001", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[1], "001", file1P1C0); // update + metadata = new 
HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[1])); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "001", file2P1C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[1], "000", file1P1C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + HoodieTestUtils.createCommitFiles(basePath, "002"); + + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file2P0C1); // update + String file3P0C2 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "002"); + metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); + assertEquals( + "Must not clean any file. 
We have to keep 1 version before the latest commit time to keep", + 0, cleaner.clean(partitionPaths[0])); + + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + + // make next commit, with 2 updates to existing files, and 1 insert + HoodieTestUtils.createCommitFiles(basePath, "003"); + + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update + HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file2P0C1); // update + String file4P0C3 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "003"); + metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + cleaner = new HoodieCleaner(metadata, config, FSUtils.getFs()); + assertEquals( + "Must not clean one old file", 1, cleaner.clean(partitionPaths[0])); + + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "000", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file2P0C1)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "002", file3P0C2)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "003", file4P0C3)); + + // No cleaning on partially written file, with no commit. 
+ HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "004", file3P0C2); // update + assertEquals("Must not clean any files" , 0, cleaner.clean(partitionPaths[0])); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file1P0C0)); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, partitionPaths[0], "001", file2P0C1)); + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java new file mode 100644 index 000000000..d9f785eda --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.io; + +import com.google.common.collect.Lists; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.common.HoodieTestDataGenerator; +import com.uber.hoodie.common.model.HoodieCommitMetadata; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.config.HoodieCompactionConfig; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; +import org.junit.Before; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; +import java.util.SortedMap; +import java.util.TreeMap; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class TestHoodieCommitArchiveLog { + private String basePath; + private FileSystem fs; + + @Before + public void init() throws Exception { + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.initializeHoodieDirectory(basePath); + fs = FSUtils.getFs(); + } + + @Test + public void testArchiveEmptyDataset() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("test-trip-table").build(); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + } + + @Test + public void testArchiveDatasetWithNoArchival() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("test-trip-table").withCompactionConfig( + HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); + 
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + HoodieTestDataGenerator.createCommitFile(basePath, "100"); + HoodieTestDataGenerator.createCommitFile(basePath, "101"); + HoodieTestDataGenerator.createCommitFile(basePath, "102"); + HoodieTestDataGenerator.createCommitFile(basePath, "103"); + + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); + assertEquals("Loaded 4 commits and the count should match", 4, + metadata.getAllCommits().getCommitList().size()); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + metadata = new HoodieTableMetadata(fs, basePath); + assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, + metadata.getAllCommits().getCommitList().size()); + } + + @Test + public void testArchiveDatasetWithArchival() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("test-trip-table").withCompactionConfig( + HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + HoodieTestDataGenerator.createCommitFile(basePath, "100"); + HoodieTestDataGenerator.createCommitFile(basePath, "101"); + HoodieTestDataGenerator.createCommitFile(basePath, "102"); + HoodieTestDataGenerator.createCommitFile(basePath, "103"); + HoodieTestDataGenerator.createCommitFile(basePath, "104"); + HoodieTestDataGenerator.createCommitFile(basePath, "105"); + + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); + SortedMap originalCommits = new TreeMap<>(metadata.getAllCommitMetadata()); + + assertEquals("Loaded 6 commits and the count should match", 6, + metadata.getAllCommits().getCommitList().size()); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + metadata = new HoodieTableMetadata(fs, basePath); + assertEquals( + "Should archive 
commits when maxCommitsToKeep is 5 and now the commits length should be minCommitsToKeep which is 2", + 2, metadata.getAllCommits().getCommitList().size()); + assertEquals("Archive should not archive the last 2 commits", + Lists.newArrayList("104", "105"), metadata.getAllCommits().getCommitList()); + + // Remove all the commits from the original commits, make it ready to be checked against the read map + for(String key:metadata.getAllCommitMetadata().keySet()) { + originalCommits.remove(key); + } + + // Read back the commits to make sure + SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), + SequenceFile.Reader.file(archiveLog.getArchiveFilePath())); + Text key = new Text(); + Text val = new Text(); + SortedMap readCommits = new TreeMap<>(); + while (reader.next(key, val)) { + HoodieCommitMetadata meta = HoodieCommitMetadata.fromJsonString(val.toString()); + readCommits.put(key.toString(), meta); + } + + assertEquals( + "Read commits map should match the originalCommits - commitsLoadedAfterArchival", + originalCommits, readCommits); + reader.close(); + } + + @Test + public void testArchiveCommitSafety() throws IOException { + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable("test-trip-table").withCompactionConfig( + HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); + HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg); + HoodieTestDataGenerator.createCommitFile(basePath, "100"); + HoodieTestDataGenerator.createCommitFile(basePath, "101"); + HoodieTestDataGenerator.createCommitFile(basePath, "102"); + HoodieTestDataGenerator.createCommitFile(basePath, "103"); + HoodieTestDataGenerator.createCommitFile(basePath, "104"); + HoodieTestDataGenerator.createCommitFile(basePath, "105"); + + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath); + assertEquals("Loaded 6 commits and 
the count should match", 6, + metadata.getAllCommits().getCommitList().size()); + boolean result = archiveLog.archiveIfRequired(); + assertTrue(result); + metadata = new HoodieTableMetadata(fs, basePath); + assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("100")); + assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("101")); + assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("102")); + assertTrue("Archived commits should always be safe", metadata.isCommitTsSafe("103")); + } + + + +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java new file mode 100644 index 000000000..7e33ad579 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/metrics/TestHoodieMetrics.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.metrics; + +import com.uber.hoodie.config.HoodieWriteConfig; + +import org.apache.commons.configuration.ConfigurationException; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestHoodieMetrics { + private HoodieMetrics metrics = null; + + @Before + public void start() throws ConfigurationException { + HoodieWriteConfig config = mock(HoodieWriteConfig.class); + when(config.isMetricsOn()).thenReturn(true); + when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.INMEMORY); + metrics = new HoodieMetrics(config, "raw_table"); + } + + @Test + public void testRegisterGauge() { + metrics.registerGauge("metric1", 123L); + assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123")); + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java new file mode 100644 index 000000000..1b389c875 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.uber.hoodie.table; + +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.WriteStatus; +import com.uber.hoodie.common.BloomFilter; +import com.uber.hoodie.common.HoodieClientTestUtils; +import com.uber.hoodie.common.HoodieTestDataGenerator; +import com.uber.hoodie.common.TestRawTripPayload; +import com.uber.hoodie.common.model.HoodieCommits; +import com.uber.hoodie.common.model.HoodieKey; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieRecordLocation; +import com.uber.hoodie.common.model.HoodieTableMetadata; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.util.ParquetUtils; + +import com.uber.hoodie.config.HoodieCompactionConfig; +import com.uber.hoodie.io.HoodieInsertHandle; +import com.uber.hoodie.config.HoodieStorageConfig; +import org.apache.avro.generic.GenericRecord; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.avro.AvroReadSupport; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.UUID; + +import scala.Option; +import scala.Tuple2; + +import static org.junit.Assert.*; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TestCopyOnWriteTable { + private String basePath = null; + private transient JavaSparkContext jsc = null; + + @Before + public void init() throws Exception { + + // Initialize a local spark env + SparkConf sparkConf = new SparkConf().setAppName("TestCopyOnWriteTable").setMaster("local[4]"); + 
jsc = new JavaSparkContext(sparkConf); + + // Create a temp folder as the base path + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + this.basePath = folder.getRoot().getAbsolutePath(); + HoodieTestUtils.initializeHoodieDirectory(basePath); + } + + @Test + public void testMakeNewPath() throws Exception { + String fileName = UUID.randomUUID().toString(); + String partitionPath = "2016/05/04"; + int unitNumber = (int) (Math.random() * 10); + HoodieRecord record = mock(HoodieRecord.class); + when(record.getPartitionPath()).thenReturn(partitionPath); + + String commitTime = HoodieTestUtils.getNewCommitTime(); + HoodieWriteConfig config = makeHoodieClientConfig(); + HoodieInsertHandle io = new HoodieInsertHandle(config, commitTime, null, partitionPath); + Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName); + assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils + .makeDataFileName(commitTime, unitNumber, fileName))); + } + + private HoodieWriteConfig makeHoodieClientConfig() throws Exception { + return makeHoodieClientConfigBuilder().build(); + } + + private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception { + // Prepare the AvroParquetIO + String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); + return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr); + } + + // TODO (weiy): Add testcases for crossing file writing. 
+ @Test + public void testUpdateRecords() throws Exception { + // Prepare the AvroParquetIO + HoodieWriteConfig config = makeHoodieClientConfig(); + String firstCommitTime = HoodieTestUtils.getNewCommitTime(); + HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + String partitionPath = "/2016/01/31"; + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(firstCommitTime, config, metadata); + + // Get some records belong to the same partition (2016/01/31) + String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; + String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}"; + String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}"; + + List records = new ArrayList<>(); + TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1); + records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1)); + TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2); + records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2)); + TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3); + records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3)); + + // Insert new records + HoodieClientTestUtils.collectStatuses(table.handleInsert(records.iterator())); + // We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO) + File parquetFile = null; + for (File file : new File(this.basePath + partitionPath).listFiles()) { + if (file.getName().endsWith(".parquet")) { + 
parquetFile = file; + break; + } + } + assertTrue(parquetFile != null); + + // Read out the bloom filter and make sure filter can answer record exist or not + Path parquetFilePath = new Path(parquetFile.getAbsolutePath()); + BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(parquetFilePath); + for (HoodieRecord record : records) { + assertTrue(filter.mightContain(record.getRecordKey())); + } + // Create a commit file + new File(this.basePath + "/" + HoodieTableMetadata.METAFOLDER_NAME + "/" + + FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile(); + + // Read the parquet file, check the record content + List fileRecords = ParquetUtils.readAvroRecords(parquetFilePath); + GenericRecord newRecord; + int index = 0; + for (GenericRecord record: fileRecords) { + assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey())); + index++; + } + + // We update the 1st record & add a new record + String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}"; + TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1); + HoodieRecord updatedRecord1 = new HoodieRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1); + updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName()))); + + TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4); + HoodieRecord insertedRecord1 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4); + + List updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1); + + Thread.sleep(1000); + String newCommitTime = HoodieTestUtils.getNewCommitTime(); + metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + table = new HoodieCopyOnWriteTable(newCommitTime, config, metadata); + Iterator> iter = 
table.handleUpdate(updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator()); + + // Check the updated file + File updatedParquetFile = null; + for (File file : new File(basePath + "/2016/01/31").listFiles()) { + if (file.getName().endsWith(".parquet")) { + if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName())) + && HoodieCommits + .isCommit1After(FSUtils.getCommitTime(file.getName()), FSUtils.getCommitTime(parquetFile.getName()))) { + updatedParquetFile = file; + break; + } + } + } + assertTrue(updatedParquetFile != null); + // Check whether the record has been updated + Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath()); + BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(updatedParquetFilePath); + for (HoodieRecord record : records) { + // No change to the _row_key + assertTrue(updatedFilter.mightContain(record.getRecordKey())); + } + + assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey())); + records.add(insertedRecord1);// add this so it can further check below + + ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build(); + index = 0; + while ((newRecord = (GenericRecord) updatedReader.read()) != null) { + assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey())); + if (index == 0) { + assertTrue(newRecord.get("number").toString().equals("15")); + } + index++; + } + updatedReader.close(); + // Also check the numRecordsWritten + List statuses = HoodieClientTestUtils.collectStatuses(iter); + WriteStatus writeStatus = statuses.get(0); + assertTrue("Should be only one file generated", statuses.size() == 1); + assertEquals(4, writeStatus.getStat().getNumWrites());//3 rewritten records + 1 new record + } + + + private List newHoodieRecords(int n, String time) throws Exception { + List records = new ArrayList<>(); + for (int i = 0; i < n; i++) { + String recordStr = 
String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}", + UUID.randomUUID().toString(), + time, + i); + TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); + records.add(new HoodieRecord( + new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), + rowChange)); + } + return records; + } + + @Test public void testInsertWithPartialFailures() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfig(); + String commitTime = HoodieTestUtils.getNewCommitTime(); + FileSystem fs = FSUtils.getFs(); + HoodieTableMetadata metadata = new HoodieTableMetadata(fs, basePath, "testTable"); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(commitTime, config, metadata); + + // Write a few records, and get atleast one file + // 10 records for partition 1, 1 record for partition 2. + List records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + + // Simulate crash after first file + List statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(records.iterator())); + WriteStatus status = statuses.get(0); + Path partialFile = new Path(String.format("%s/%s/%s", + basePath, + status.getPartitionPath(), + FSUtils.makeDataFileName(commitTime, 0, status.getFileId())) + ); + assertTrue(fs.exists(partialFile)); + + // When we retry + records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + + statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(records.iterator())); + status = statuses.get(0); + + Path retriedFIle = new Path(String.format("%s/%s/%s", + basePath, + status.getPartitionPath(), + FSUtils.makeDataFileName(commitTime, 0, status.getFileId())) + ); + assertTrue(fs.exists(retriedFIle)); + assertFalse(fs.exists(partialFile)); + } + + + @Test public void testInsertRecords() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfig(); + String 
commitTime = HoodieTestUtils.getNewCommitTime(); + HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(commitTime, config, metadata); + + // Case 1: + // 10 records for partition 1, 1 record for partition 2. + List records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z")); + + // Insert new records + List returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(records.iterator())); + + + // TODO: check the actual files and make sure 11 records, total were written. + assertEquals(2, returnedStatuses.size()); + assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath()); + assertEquals(0, returnedStatuses.get(0).getFailedRecords().size()); + assertEquals(10, returnedStatuses.get(0).getWrittenRecords().size()); + assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath()); + assertEquals(0, returnedStatuses.get(0).getFailedRecords().size()); + assertEquals(1, returnedStatuses.get(1).getWrittenRecords().size()); + + // Case 2: + // 1 record for partition 1, 5 record for partition 2, 1 records for partition 3. 
+ records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z"); + records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z")); + records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z")); + + // Insert new records + returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(records.iterator())); + + assertEquals(3, returnedStatuses.size()); + assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath()); + assertEquals(1, returnedStatuses.get(0).getWrittenRecords().size()); + + assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath()); + assertEquals(5, returnedStatuses.get(1).getWrittenRecords().size()); + + assertEquals("2016/02/02", returnedStatuses.get(2).getPartitionPath()); + assertEquals(1, returnedStatuses.get(2).getWrittenRecords().size()); + + } + + @Test public void testFileSizeUpsertRecords() throws Exception { + HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig( + HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024) + .parquetPageSize(64 * 1024).build()).build(); + String commitTime = HoodieTestUtils.getNewCommitTime(); + HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(commitTime, config, metadata); + + List records = new ArrayList<>(); + // Approx 1150 records are written for block size of 64KB + for (int i = 0; i < 2000; i++) { + String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}"; + TestRawTripPayload rowChange = new TestRawTripPayload(recordStr); + records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), + rowChange)); + } + + // Insert new records + HoodieClientTestUtils.collectStatuses(table.handleInsert(records.iterator())); + + // Check the updated file + int counts = 0; + for (File file : new File(basePath + 
"/2016/01/31").listFiles()) { + if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) { + System.out.println(file.getName() + "-" + file.length()); + counts++; + } + } + assertEquals( + "If the number of records are more than 1150, then there should be a new file", 3, + counts); + } + + + + private List testUpsertPartitioner(int smallFileSize, + int numInserts, + int numUpdates, + int fileSize, + boolean autoSplitInserts) throws Exception { + final String TEST_PARTITION_PATH = "2016/09/26"; + HoodieWriteConfig config = makeHoodieClientConfigBuilder() + .withCompactionConfig(HoodieCompactionConfig.newBuilder() + .compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build()) + .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); + + HoodieClientTestUtils.fakeCommitFile(basePath, "001"); + HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize); + + HoodieTableMetadata metadata = new HoodieTableMetadata(FSUtils.getFs(), basePath, "testTable"); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable("001", config, metadata); + + HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH}); + List insertRecords = dataGenerator.generateInserts("001", numInserts); + List updateRecords = dataGenerator.generateUpdates("001", numUpdates); + for (HoodieRecord updateRec: updateRecords) { + updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1")); + } + List records = new ArrayList<>(); + records.addAll(insertRecords); + records.addAll(updateRecords); + WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records)); + HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner) + table.getUpsertPartitioner(profile); + + assertEquals("Should have 3 partitions", 3, partitioner.numPartitions()); + 
assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE, + partitioner.getBucketInfo(0).bucketType); + assertEquals("Bucket 1 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT, + partitioner.getBucketInfo(1).bucketType); + assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT, + partitioner.getBucketInfo(2).bucketType); + assertEquals("Update record should have gone to the 1 update partiton", 0, + partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation())))); + return partitioner.getInsertBuckets(TEST_PARTITION_PATH); + } + + + @Test + public void testUpsertPartitioner() throws Exception { + // Inserts + Updates... Check all updates go together & inserts subsplit + List insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false); + assertEquals("Total of 2 insert buckets", 2, insertBuckets.size()); + } + + + @Test + public void testUpsertPartitionerWithSmallInsertHandling() throws Exception { + // Inserts + Updates .. 
Check updates go together & inserts subsplit, after expanding smallest file + List insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, false); + assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); + assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber); + assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01); + + // Now with insert split size auto tuned + insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true); + assertEquals("Total of 3 insert buckets", 3, insertBuckets.size()); + assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber); + assertEquals("First insert bucket should have weight 0.5", 200.0/2400, insertBuckets.get(0).weight, 0.01); + } + + @After + public void cleanup() { + if (basePath != null) { + new File(basePath).delete(); + } + if (jsc != null) { + jsc.stop(); + } + } +} diff --git a/hoodie-client/src/test/resources/exampleEvolvedSchema.txt b/hoodie-client/src/test/resources/exampleEvolvedSchema.txt new file mode 100644 index 000000000..4edbb48c9 --- /dev/null +++ b/hoodie-client/src/test/resources/exampleEvolvedSchema.txt @@ -0,0 +1,23 @@ +{ + "namespace": "example.schema", + "type": "record", + "name": "trip", + "fields": [ + { + "name": "number", + "type": ["int", "null"] + }, + { + "name": "time", + "type": "string" + }, + { + "name": "_row_key", + "type": "string" + }, + { + "name": "added_field", + "type": ["int", "null"] + } + ] +} diff --git a/hoodie-client/src/test/resources/exampleSchema.txt b/hoodie-client/src/test/resources/exampleSchema.txt new file mode 100644 index 000000000..902eecaa7 --- /dev/null +++ b/hoodie-client/src/test/resources/exampleSchema.txt @@ -0,0 +1,19 @@ +{ + "namespace": "example.schema", + "type": "record", + "name": "trip", + "fields": [ + { + "name": "_row_key", + "type": "string" + }, + { + "name": "time", + 
"type": "string" + }, + { + "name": "number", + "type": ["int", "null"] + } + ] +} diff --git a/hoodie-client/src/test/resources/log4j.properties b/hoodie-client/src/test/resources/log4j.properties new file mode 100644 index 000000000..5a8b643fd --- /dev/null +++ b/hoodie-client/src/test/resources/log4j.properties @@ -0,0 +1,23 @@ +# +# Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/GenericHoodiePayload.java b/hoodie-common/src/main/java/com/uber/hoodie/common/GenericHoodiePayload.java new file mode 100644 index 000000000..6144f99d4 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/GenericHoodiePayload.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie.common; + +import com.uber.hoodie.avro.MercifulJsonConverter; +import com.uber.hoodie.common.model.HoodieRecordPayload; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.commons.io.IOUtils; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONObject; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.StringWriter; +import java.text.SimpleDateFormat; +import java.util.Date; +import java.util.zip.Deflater; +import java.util.zip.DeflaterOutputStream; +import java.util.zip.InflaterInputStream; + +public class GenericHoodiePayload implements HoodieRecordPayload { + private byte[] jsonDataCompressed; + private int dataSize; + + public GenericHoodiePayload(String json) throws IOException { + this.jsonDataCompressed = compressData(json); + this.dataSize = json.length(); + } + + @Override public GenericHoodiePayload preCombine(GenericHoodiePayload another) { + return this; + } + + @Override public IndexedRecord combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException { + return getInsertValue(schema); + } + + @Override public IndexedRecord getInsertValue(Schema schema) throws IOException { + MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema); + return jsonConverter.convert(getJsonData()); + } + + public String getRowKey(HoodieSQLStreamer.Config cfg) throws IOException, JSONException { + JSONObject object = new 
JSONObject(getJsonData()); + if(!object.has(cfg.keyColumnField)) { + return ""; + } + return (String) object.get(cfg.keyColumnField); + } + + public String getPartitionPath(HoodieSQLStreamer.Config cfg) throws IOException, JSONException { + JSONObject object = new JSONObject(getJsonData()); + if(!object.has(cfg.partitionPathField)) { + return "0000/00/00"; + } + return object.getString(cfg.partitionPathField); + } + + private String getJsonData() throws IOException { + return unCompressData(jsonDataCompressed); + } + + private byte[] compressData(String jsonData) throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + Deflater deflater = new Deflater(Deflater.BEST_COMPRESSION); + DeflaterOutputStream dos = + new DeflaterOutputStream(baos, deflater, true); + try { + dos.write(jsonData.getBytes()); + } finally { + dos.flush(); + dos.close(); + // Its important to call this. + // Deflater takes off-heap native memory and does not release until GC kicks in + deflater.end(); + } + return baos.toByteArray(); + } + + + private String unCompressData(byte[] data) throws IOException { + InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data)); + try { + StringWriter sw = new StringWriter(dataSize); + IOUtils.copy(iis, sw); + return sw.toString(); + } finally { + iis.close(); + } + } + +} diff --git a/pom.xml b/pom.xml new file mode 100644 index 000000000..52c19f357 --- /dev/null +++ b/pom.xml @@ -0,0 +1,395 @@ + + + + + 4.0.0 + + com.uber.hoodie + hoodie + pom + 0.2.5-SNAPSHOT + + hoodie-common + hoodie-client + hoodie-cli + + + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + + + Uber Technologies Inc. 
+ http://www.uber.com/ + + + + + vinoth + Vinoth Chandar + Uber + + + prasanna + Prasanna Rajaperumal + Uber + + + + + + Wei Yan + Uber + + + + 2015-2016 + + + com.google.code.gson + gson + 2.3.1 + test + + + junit + junit + ${junit.version} + + + + + 2.10 + 2.6 + 2.19.1 + 1.7.0 + 4.11 + 1.9.5 + 1.2.17 + 5.7.2 + 2.6.0 + 1.1.0 + 3.1.1 + 1.5.1 + + + + scm:git:git@github.com:uber/hoodie.git + scm:git:git@github.com:uber/hoodie.git + git@github.com:uber/hoodie.git + HEAD + + + + + User List + hoodie-user@googlegroups.com + https://groups.google.com/d/forum/hoodie-user/ + + + Developer List + hoodie-dev@googlegroups.com + https://groups.google.com/d/forum/hoodie-dev/ + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.7 + 1.7 + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.3 + + + + + + maven-dependency-plugin + ${maven-dependency-plugin.version} + + + maven-jar-plugin + ${maven-jar-plugin.version} + + + maven-surefire-plugin + ${maven-surefire-plugin.version} + + + org.codehaus.mojo + cobertura-maven-plugin + 2.7 + + + html + xml + + + + + test + + cobertura + + + + + + + org.apache.rat + apache-rat-plugin + 0.11 + + + **/.* + **/*.txt + **/*.sh + + + + + package + + check + + + + + + + + + + + + + com.beust + jcommander + 1.48 + + + + log4j + log4j + ${log4j.version} + + + + org.apache.hadoop + hadoop-client + ${hadoop.version} + + + + org.apache.parquet + parquet-avro + ${parquet.version} + + + + org.apache.parquet + parquet-hadoop + ${parquet.version} + + + + + com.google.guava + guava + 15.0 + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hadoop + hadoop-hdfs + ${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hadoop + hadoop-auth + ${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hive + hive-common + ${hive.version}-cdh${cdh.version} + provided + + + + + + + + + org.apache.hadoop + hadoop-mapreduce-client-core + 
${hadoop.version}-cdh${cdh.version} + provided + + + org.apache.hadoop + hadoop-mapreduce-client-common + 2.6.0-cdh5.7.2 + provided + + + org.apache.hive + hive-exec + 1.1.0-cdh5.7.2 + provided + + + commons-logging + commons-logging + 1.2 + + + + + + + com.twitter + parquet-hadoop-bundle + 1.5.0-cdh5.7.2 + + + com.twitter + parquet-hive-bundle + 1.5.0 + + + + org.apache.parquet + parquet-hive-bundle + 1.7.0 + + + + org.apache.spark + spark-core_2.10 + ${spark.version} + provided + + + + org.apache.hbase + hbase-client + 1.0.0 + + + + org.apache.avro + avro + 1.7.6-cdh5.7.2 + + + org.slf4j + slf4j-api + + + + + + + io.dropwizard.metrics + metrics-graphite + ${metrics.version} + + + io.dropwizard.metrics + metrics-core + ${metrics.version} + + + + xerces + xercesImpl + 2.9.1 + + + xalan + xalan + 2.7.1 + + + + commons-dbcp + commons-dbcp + 1.4 + + + org.apache.httpcomponents + httpcore + 4.3.2 + + + org.slf4j + slf4j-api + 1.7.5 + + + org.apache.commons + commons-configuration2 + 2.1 + + + + com.fasterxml.jackson.core + jackson-annotations + 2.6.0 + + + org.codehaus.jackson + jackson-mapper-asl + 1.9.13 + + + + + + + + + cloudera-repo-releases + https://repository.cloudera.com/artifactory/repo/ + + + +