File tree Expand file tree Collapse file tree 3 files changed +27
-15
lines changed 
java/org/metafacture/html 
test/java/org/metafacture/html Expand file tree Collapse file tree 3 files changed +27
-15
lines changed Original file line number Diff line number Diff line change 3030import  org .metafacture .framework .helpers .DefaultObjectPipe ;
3131
3232/** 
33-  * Extracts the first script  from an HTML document 
33+  * Extracts the specified element  from an HTML document 
3434 * 
3535 * @author Fabian Steeg 
3636 */ 
37- @ Description ("Extracts the first script  from an HTML document" )
37+ @ Description ("Extracts the specified element  from an HTML document" )
3838@ In (Reader .class )
3939@ Out (String .class )
40- @ FluxCommand ("extract-script" )
41- public  class  ScriptExtractor  extends  DefaultObjectPipe <Reader , ObjectReceiver <String >> {
40+ @ FluxCommand ("extract-element" )
41+ public  class  ElementExtractor  extends  DefaultObjectPipe <Reader , ObjectReceiver <String >> {
42+     private  String  selector ;
43+ 
44+     /** 
45+      * @param selector The CSS-style jsoup selector, see https://jsoup.org/cookbook/extracting-data/selector-syntax 
46+      */ 
47+     public  ElementExtractor (final  String  selector ) {
48+         this .selector  = selector ;
49+     }
50+ 
4251    @ Override 
4352    public  void  process (final  Reader  reader ) {
4453        try  {
4554            Document  document  = Jsoup .parse (IOUtils .toString (reader ));
46-             Element  firstScript  = document .select ("script" ).first ();
47-             getReceiver ().process (firstScript .data ());
55+             Element  firstElement  = document .select (selector ).first ();
56+             getReceiver ().process (firstElement .data ());
4857        } catch  (IOException  e ) {
4958            e .printStackTrace ();
5059        }
Original file line number Diff line number Diff line change 1414#  limitations under the License.
1515# 
1616decode-html  org.metafacture.html.HtmlDecoder 
17- extract-script   org.metafacture.html.ScriptExtractor  
17+ extract-element   org.metafacture.html.ElementExtractor  
Original file line number Diff line number Diff line change 2828import  org .mockito .MockitoAnnotations ;
2929
3030/** 
31-  * Tests for {@link ScriptExtractor }. 
31+  * Tests for {@link ElementExtractor }. 
3232 * 
3333 * @author Fabian Steeg 
3434 * 
3535 */ 
36- public  final  class  ScriptExtractorTest  {
36+ public  final  class  ElementExtractorTest  {
3737
38-     private  static  final  StringReader  IN  = new  StringReader ("<html><script>{\" code\" :\" yo\" }" );
38+     private  static  final  StringReader  IN  = new  StringReader ("<html>" 
39+             + "<script data-test='site-head-data'>{\" code\" :\" hey\" }</script>" 
40+             + "<script data-test='model-linked-data'>{\" code\" :\" yo\" }" );
41+     
3942    private  static  final  String  OUT  = "{\" code\" :\" yo\" }" ;
4043
41-     private  ScriptExtractor   scriptExtractor ;
44+     private  ElementExtractor   elementExtractor ;
4245
4346    @ Mock 
4447    private  ObjectReceiver <String > receiver ;
4548
4649    @ Before 
4750    public  void  setup () {
4851        MockitoAnnotations .initMocks (this );
49-         scriptExtractor  = new  ScriptExtractor ( );
50-         scriptExtractor .setReceiver (receiver );
52+         elementExtractor  = new  ElementExtractor ( "script[data-test=model-linked-data]" );
53+         elementExtractor .setReceiver (receiver );
5154    }
5255
5356    @ Test 
5457    public  void  testShouldProcessRecordsFollowedbySeparator () {
55-         scriptExtractor .process (IN );
58+         elementExtractor .process (IN );
5659        verify (receiver ).process (OUT );
5760        verifyNoMoreInteractions (receiver );
5861    }
5962
6063    @ After 
6164    public  void  cleanup () {
62-         scriptExtractor .closeStream ();
65+         elementExtractor .closeStream ();
6366    }
6467}
 
 
   
 
     
   
   
          
    
    
     
    
      
     
     
    You can’t perform that action at this time.
  
 
    
  
    
      
        
     
       
      
     
   
 
    
    
  
 
  
 
     
    
0 commit comments