Java source code: how to implement a web crawler?

Posted by a user · 2022-04-22 13:36

3 answers

Helpful netizen · 2023-07-14 20:13

// Java crawler demo: walks paginated pages and downloads every image matched by a regex

import java.io.File;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Scanner;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DownMM {
    public static void main(String[] args) throws Exception {
        // out is the output directory; note that it must end with \\
        String out = "D:\\JSP\\pic\\java\\";
        File f = new File(out);
        if (!f.exists()) {
            f.mkdirs();
        }

        String url = "http://www.mzitu.com/share/comment-page-";
        // Capture the src attribute of every <img> tag on the page
        Pattern reg = Pattern.compile("<img src=\"(.*?)\"");
        for (int j = 0, i = 1; i <= 10; i++) {
            URL uu = new URL(url + i);
            URLConnection conn = uu.openConnection();
            // Send a browser-like User-Agent so the server does not reject the request
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko");
            // With the \A delimiter, the Scanner returns the whole response body as one token
            try (Scanner sc = new Scanner(conn.getInputStream())) {
                Matcher m = reg.matcher(sc.useDelimiter("\\A").next());
                while (m.find()) {
                    // Save each matched image under a random file name
                    Files.copy(new URL(m.group(1)).openStream(), Paths.get(out + UUID.randomUUID() + ".jpg"));
                    System.out.println("Downloaded: " + j++);
                }
            }
        }
    }
}
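A weakness of this demo is that a single slow or dead image link aborts the whole run with an exception. As a rough hardening sketch (the downloadImage helper and its name are mine, not part of the answer), each download could get its own timeouts and error handling:

import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;

public class SafeDownload {
    // Hypothetical helper: download one image, tolerating slow or broken links
    static boolean downloadImage(String src, Path target) {
        try {
            URLConnection conn = new URL(src).openConnection();
            conn.setConnectTimeout(5000);  // give up connecting after 5 seconds
            conn.setReadTimeout(10000);    // give up reading after 10 seconds
            try (InputStream in = conn.getInputStream()) {
                Files.copy(in, target);
            }
            return true;
        } catch (Exception e) {
            // Skip this image instead of aborting the whole crawl
            System.err.println("Failed: " + src + " (" + e.getMessage() + ")");
            return false;
        }
    }
}

Each matched URL would then go through downloadImage(m.group(1), Paths.get(out + UUID.randomUUID() + ".jpg")) instead of the bare Files.copy call.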

Helpful netizen · 2023-07-14 20:14

package com.spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class Spider {
    // Load a Document from a URL
    public Document loadDocDataUrl(String url) {
        Document doc = null;
        try {
            // Jsoup.connect(String url) fetches and parses the page
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }

    // Given an HTML document whose structure you know, extract the data you need.
    public List<String> parserDoc(Document doc) {
        // Once the HTML is parsed into a Document, it can be navigated with DOM-like methods.
        List<String> list = new ArrayList<>();
        Elements elements = doc.getElementsByClass("news_top");
        Elements links = elements.get(0).getElementsByTag("a");
        for (int i = 0; i < links.size(); i++) {
            list.add(links.get(i).attr("href"));
        }
        return list;
    }

    // Pull the fields of one article page into a News object
    public News parseDetail(Document doc) {
        String title = doc.getElementsByClass("main-title").text();
        String publishDate = doc.select(".date-sourse > .date").text();
        String article = doc.getElementsByClass("article").text();
        String keywords = doc.getElementsByClass("keywords").text();
        String author = doc.getElementsByClass("show_author").text();

        News news = new News();
        news.setTitle(title);
        news.setContent(article);
        news.setKeywords(keywords);
        news.setPublishDate(publishDate);
        news.setAuthor(author);
        return news;
    }

    public static void main(String[] args) {
        Spider spider = new Spider();
        Document doc = spider.loadDocDataUrl("http://www.sina.com.cn");
        List<String> list = spider.parserDoc(doc);
        for (String url : list) {
            Document detailDoc = spider.loadDocDataUrl(url);
            News news = spider.parseDetail(detailDoc);
            System.out.println(news);
        }
    }
}
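The Spider class calls a News class that the answer never shows. A minimal sketch of what it presumably looks like, assuming plain String fields matching the setters used in parseDetail:

package com.spider;

// Minimal assumed shape of the News holder; field names mirror the
// setters used in Spider.parseDetail.
public class News {
    private String title;
    private String content;
    private String keywords;
    private String publishDate;
    private String author;

    public void setTitle(String title) { this.title = title; }
    public void setContent(String content) { this.content = content; }
    public void setKeywords(String keywords) { this.keywords = keywords; }
    public void setPublishDate(String publishDate) { this.publishDate = publishDate; }
    public void setAuthor(String author) { this.author = author; }

    @Override
    public String toString() {
        return title + " | " + publishDate + " | " + author;
    }
}

Note that Jsoup.connect(url) also accepts chained .userAgent(...) and .timeout(...) calls before .get(), which is worth setting for the same robustness reasons as in the first answer.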
