Posted by a user · 2022-04-22 13:36
3 answers in total
Helpful user · 2023-07-14 20:13
// Java crawler demo
import java.io.File;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Scanner;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DownMM {
    public static void main(String[] args) throws Exception {
        // out is the output directory; note that it must end with \\
        String out = "D:\\JSP\\pic\\java\\";
        try {
            File f = new File(out);
            if (!f.exists()) {
                f.mkdirs();
            }
        } catch (Exception e) {
            System.out.println("Failed to create output directory: " + e);
        }
        String url = "http://www.mzitu.com/share/comment-page-";
        // Capture the src attribute of every <img> tag, non-greedily.
        Pattern reg = Pattern.compile("<img src=\"(.*?)\"");
        for (int j = 0, i = 1; i <= 10; i++) {
            URL uu = new URL(url + i);
            URLConnection conn = uu.openConnection();
            // Send a browser-like User-Agent so the server does not reject the request.
            conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko");
            // "\\A" makes Scanner return the entire response as one token.
            try (Scanner sc = new Scanner(conn.getInputStream())) {
                Matcher m = reg.matcher(sc.useDelimiter("\\A").next());
                while (m.find()) {
                    // Save each image under a random file name to avoid collisions.
                    try (InputStream in = new URL(m.group(1)).openStream()) {
                        Files.copy(in, Paths.get(out + UUID.randomUUID() + ".jpg"));
                    }
                    System.out.println("Downloaded: " + j++);
                }
            }
        }
    }
}
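The heavy lifting above is done by the regular expression <img src="(.*?)": group 1 captures, non-greedily, whatever sits between the quotes of each img tag's src attribute. A minimal, self-contained sketch of just that matching step (the sample HTML string is made up for illustration):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ImgSrcDemo {
    public static void main(String[] args) {
        // Made-up sample markup; in the crawler above the HTML comes from the URLConnection.
        String html = "<img src=\"http://example.com/a.jpg\"> <img src=\"http://example.com/b.jpg\">";
        Pattern reg = Pattern.compile("<img src=\"(.*?)\"");
        Matcher m = reg.matcher(html);
        while (m.find()) {
            // group(1) is the captured src value, e.g. http://example.com/a.jpg
            System.out.println(m.group(1));
        }
    }
}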
Helpful user · 2023-07-14 20:13
Same code as the answer above, with only the start URL changed to http://www.91mntu.com/share/comment-page-.
Helpful user · 2023-07-14 20:14
package com.spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class Spider {
    // Load a Document from a URL.
    public Document loadDocDataUrl(String url) {
        Document doc = null;
        try {
            // Jsoup.connect(String url) fetches and parses the page.
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }

    // Given an HTML document whose structure you know, extract the data you need.
    public List<String> parserDoc(Document doc) {
        // Once the HTML is parsed into a Document, it can be traversed with DOM-like methods.
        List<String> list = new ArrayList<>();
        Elements elements = doc.getElementsByClass("news_top");
        Elements links = elements.get(0).getElementsByTag("a");
        for (int i = 0; i < links.size(); i++) {
            list.add(links.get(i).attr("href"));
        }
        return list;
    }

    public News parseDetail(Document doc) {
        String title = doc.getElementsByClass("main-title").text();
        String publishDate = doc.select(".date-sourse > .date").text();
        String article = doc.getElementsByClass("article").text();
        String keywords = doc.getElementsByClass("keywords").text();
        String author = doc.getElementsByClass("show_author").text();
        News news = new News();
        news.setTitle(title);
        news.setContent(article);
        news.setKeywords(keywords);
        news.setPublishDate(publishDate);
        news.setAuthor(author);
        return news;
    }

    public static void main(String[] args) {
        Spider spider = new Spider();
        Document doc = spider.loadDocDataUrl("http://www.sina.com.cn");
        List<String> list = spider.parserDoc(doc);
        for (String url : list) {
            Document detailDoc = spider.loadDocDataUrl(url);
            News news = spider.parseDetail(detailDoc);
            System.out.println(news);
        }
    }
}
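Note that the Spider above references a News class it never defines. A minimal bean sketch, with fields inferred from the setters called in parseDetail, that makes the example compile (the original class may well carry more fields):

package com.spider;

// Minimal sketch of the News bean assumed by Spider; only the setters
// actually called in parseDetail are included.
public class News {
    private String title;
    private String content;
    private String keywords;
    private String publishDate;
    private String author;

    public void setTitle(String title) { this.title = title; }
    public void setContent(String content) { this.content = content; }
    public void setKeywords(String keywords) { this.keywords = keywords; }
    public void setPublishDate(String publishDate) { this.publishDate = publishDate; }
    public void setAuthor(String author) { this.author = author; }

    @Override
    public String toString() {
        return "News[title=" + title + ", publishDate=" + publishDate
                + ", author=" + author + ", keywords=" + keywords
                + ", content=" + content + "]";
    }
}

Running Spider also requires the jsoup library (org.jsoup:jsoup) on the classpath.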